blob: c42cd0c6b33476a79380a4d36fb74268ccadad19 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Maximum number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Keep-alive threshold for objects parked on the free list.

   An object on the free list keeps its character buffer allocated
   when its length is below this limit, which saves a malloc() for
   small Unicode objects.

   In the worst case this retains PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters: a
   128-entry table indexed by character value, 1 for whitespace and
   0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009 HORIZONTAL TABULATION
     *   0x000A LINE FEED
     *   0x000B VERTICAL TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Fast detection of ASCII line breaks: a 128-entry table indexed by
   character value, 1 for a line break and 0 otherwise.  The table is
   only ever read, so it is declared const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A LINE FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
/* --- Bloom Filters ----------------------------------------------------- */

/* Simple "bloom filters" for Unicode characters.  To keep things
   simple, a single bitmask is used, with the least significant 5 bits
   of each character selecting the bit index. */

/* the linebreak mask is set up by the Unicode init code below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Nonzero when the bit selected by ch is set in mask, i.e. ch *may*
   belong to the set the mask was built from; zero means it cannot.
   The shifted constant is 1UL so that bit 31 is reachable without
   overflowing a signed int, and mask is parenthesized so that
   expression arguments bind as intended. */
#define BLOOM(mask, ch) ((mask) & (1UL << ((ch) & 0x1F)))

/* Line-break test: a table lookup for characters below 128, otherwise
   the bloom pre-filter followed by the exact Py_UNICODE_ISLINEBREAK()
   check. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000202
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* True if chr is a member of set: the bloom mask is consulted first
   and the exact scan in unicode_member() only runs on a hit.  The
   expansion is fully parenthesized so that it composes correctly with
   surrounding operators such as ! or ||. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
230
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Benjamin Peterson857ce152009-01-31 16:29:18 +0000247 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
283/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000284 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285
286 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000287 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288
289*/
290
291static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293{
294 register PyUnicodeObject *unicode;
295
Andrew Dalkee0df7622006-05-27 11:04:36 +0000296 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
300 }
301
Neal Norwitze7d8be82008-07-31 17:17:14 +0000302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
305 }
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000316 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
319 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000320 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000324 }
325 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000328 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 if (unicode == NULL)
331 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 }
335
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 PyErr_NoMemory();
338 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000339 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
346 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000349 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000351 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000353
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000354 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000357 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000358 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360}
361
362static
Guido van Rossum9475a232001-10-05 20:51:39 +0000363void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000365 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000366 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
372 }
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
376 }
377 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 }
387}
388
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000389static
390int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391{
392 register PyUnicodeObject *v;
393
394 /* Argument checks */
395 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000396 PyErr_BadInternalCall();
397 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000398 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000399 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000401 PyErr_BadInternalCall();
402 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 }
404
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000408 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000418 }
419
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
423}
424
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000425int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
426{
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
428}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000431 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432{
433 PyUnicodeObject *unicode;
434
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
438
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000443 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000444
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
455 }
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
458 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000467 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000468
469 return (PyObject *)unicode;
470}
471
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000472PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
473{
474 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000475
Benjamin Peterson857ce152009-01-31 16:29:18 +0000476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000478 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000479 return NULL;
480 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000481
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
487
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000492 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000493
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
504 }
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
507 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000508
509 return PyUnicode_DecodeUTF8(u, size, NULL);
510 }
511
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
515
516 return (PyObject *)unicode;
517}
518
519PyObject *PyUnicode_FromString(const char *u)
520{
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
525 }
526
527 return PyUnicode_FromStringAndSize(u, size);
528}
529
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530#ifdef HAVE_WCHAR_H
531
Mark Dickinson6b265f12009-03-18 16:07:26 +0000532#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
533# define CONVERT_WCHAR_TO_SURROGATES
534#endif
535
536#ifdef CONVERT_WCHAR_TO_SURROGATES
537
538/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
539 to convert from UTF32 to UTF16. */
540
541PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
542 Py_ssize_t size)
543{
544 PyUnicodeObject *unicode;
545 register Py_ssize_t i;
546 Py_ssize_t alloc;
547 const wchar_t *orig_w;
548
549 if (w == NULL) {
550 PyErr_BadInternalCall();
551 return NULL;
552 }
553
554 alloc = size;
555 orig_w = w;
556 for (i = size; i > 0; i--) {
557 if (*w > 0xFFFF)
558 alloc++;
559 w++;
560 }
561 w = orig_w;
562 unicode = _PyUnicode_New(alloc);
563 if (!unicode)
564 return NULL;
565
566 /* Copy the wchar_t data into the new object */
567 {
568 register Py_UNICODE *u;
569 u = PyUnicode_AS_UNICODE(unicode);
570 for (i = size; i > 0; i--) {
571 if (*w > 0xFFFF) {
572 wchar_t ordinal = *w++;
573 ordinal -= 0x10000;
574 *u++ = 0xD800 | (ordinal >> 10);
575 *u++ = 0xDC00 | (ordinal & 0x3FF);
576 }
577 else
578 *u++ = *w++;
579 }
580 }
581 return (PyObject *)unicode;
582}
583
584#else
585
Guido van Rossumd57fd912000-03-10 22:53:23 +0000586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000587 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588{
589 PyUnicodeObject *unicode;
590
591 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000592 PyErr_BadInternalCall();
593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000594 }
595
596 unicode = _PyUnicode_New(size);
597 if (!unicode)
598 return NULL;
599
600 /* Copy the wchar_t data into the new object */
601#ifdef HAVE_USABLE_WCHAR_T
602 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000603#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000604 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000605 register Py_UNICODE *u;
606 register Py_ssize_t i;
607 u = PyUnicode_AS_UNICODE(unicode);
608 for (i = size; i > 0; i--)
609 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 }
611#endif
612
613 return (PyObject *)unicode;
614}
615
Mark Dickinson6b265f12009-03-18 16:07:26 +0000616#endif /* CONVERT_WCHAR_TO_SURROGATES */
617
618#undef CONVERT_WCHAR_TO_SURROGATES
619
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000620static void
621makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
622{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000623 *fmt++ = '%';
624 if (width) {
625 if (zeropad)
626 *fmt++ = '0';
627 fmt += sprintf(fmt, "%d", width);
628 }
629 if (precision)
630 fmt += sprintf(fmt, ".%d", precision);
631 if (longflag)
632 *fmt++ = 'l';
633 else if (size_tflag) {
634 char *f = PY_FORMAT_SIZE_T;
635 while (*f)
636 *fmt++ = *f++;
637 }
638 *fmt++ = c;
639 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000640}
641
/* Append the NUL-terminated string to the output at s, advancing s;
   relies on the caller's `copy` and `s` variables.  The terminating
   NUL itself is not copied. */
#define appendstring(string) {copy = (string); while (*copy) *s++ = *copy++;}
643
644PyObject *
645PyUnicode_FromFormatV(const char *format, va_list vargs)
646{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000647 va_list count;
648 Py_ssize_t callcount = 0;
649 PyObject **callresults = NULL;
650 PyObject **callresult = NULL;
651 Py_ssize_t n = 0;
652 int width = 0;
653 int precision = 0;
654 int zeropad;
655 const char* f;
656 Py_UNICODE *s;
657 PyObject *string;
658 /* used by sprintf */
659 char buffer[21];
660 /* use abuffer instead of buffer, if we need more space
661 * (which can happen if there's a format specifier with width). */
662 char *abuffer = NULL;
663 char *realbuffer;
664 Py_ssize_t abuffersize = 0;
665 char fmt[60]; /* should be enough for %0width.precisionld */
666 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000667
668#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000669 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000670#else
671#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000672 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000673#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000674 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000675#endif
676#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000677 /* step 1: count the number of %S/%R/%s format specifications
678 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
679 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000680 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000681 if (*f == '%') {
682 if (*(f+1)=='%')
683 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000684 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000685 ++callcount;
686 while (isdigit((unsigned)*f))
687 width = (width*10) + *f++ - '0';
688 while (*++f && *f != '%' && !isalpha((unsigned)*f))
689 ;
690 if (*f == 's')
691 ++callcount;
692 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000693 }
694 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000695 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000696 if (callcount) {
697 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
698 if (!callresults) {
699 PyErr_NoMemory();
700 return NULL;
701 }
702 callresult = callresults;
703 }
704 /* step 3: figure out how large a buffer we need */
705 for (f = format; *f; f++) {
706 if (*f == '%') {
707 const char* p = f;
708 width = 0;
709 while (isdigit((unsigned)*f))
710 width = (width*10) + *f++ - '0';
711 while (*++f && *f != '%' && !isalpha((unsigned)*f))
712 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000713
Benjamin Peterson857ce152009-01-31 16:29:18 +0000714 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
715 * they don't affect the amount of space we reserve.
716 */
717 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000718 (f[1] == 'd' || f[1] == 'u'))
719 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000720
Benjamin Peterson857ce152009-01-31 16:29:18 +0000721 switch (*f) {
722 case 'c':
723 (void)va_arg(count, int);
724 /* fall through... */
725 case '%':
726 n++;
727 break;
728 case 'd': case 'u': case 'i': case 'x':
729 (void) va_arg(count, int);
730 /* 20 bytes is enough to hold a 64-bit
731 integer. Decimal takes the most space.
732 This isn't enough for octal.
733 If a width is specified we need more
734 (which we allocate later). */
735 if (width < 20)
736 width = 20;
737 n += width;
738 if (abuffersize < width)
739 abuffersize = width;
740 break;
741 case 's':
742 {
743 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000744 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000745 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
746 if (!str)
747 goto fail;
748 n += PyUnicode_GET_SIZE(str);
749 /* Remember the str and switch to the next slot */
750 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000751 break;
752 }
753 case 'U':
754 {
755 PyObject *obj = va_arg(count, PyObject *);
756 assert(obj && PyUnicode_Check(obj));
757 n += PyUnicode_GET_SIZE(obj);
758 break;
759 }
760 case 'V':
761 {
762 PyObject *obj = va_arg(count, PyObject *);
763 const char *str = va_arg(count, const char *);
764 assert(obj || str);
765 assert(!obj || PyUnicode_Check(obj));
766 if (obj)
767 n += PyUnicode_GET_SIZE(obj);
768 else
769 n += strlen(str);
770 break;
771 }
772 case 'S':
773 {
774 PyObject *obj = va_arg(count, PyObject *);
775 PyObject *str;
776 assert(obj);
777 str = PyObject_Str(obj);
778 if (!str)
779 goto fail;
780 n += PyUnicode_GET_SIZE(str);
781 /* Remember the str and switch to the next slot */
782 *callresult++ = str;
783 break;
784 }
785 case 'R':
786 {
787 PyObject *obj = va_arg(count, PyObject *);
788 PyObject *repr;
789 assert(obj);
790 repr = PyObject_Repr(obj);
791 if (!repr)
792 goto fail;
793 n += PyUnicode_GET_SIZE(repr);
794 /* Remember the repr and switch to the next slot */
795 *callresult++ = repr;
796 break;
797 }
798 case 'p':
799 (void) va_arg(count, int);
800 /* maximum 64-bit pointer representation:
801 * 0xffffffffffffffff
802 * so 19 characters is enough.
803 * XXX I count 18 -- what's the extra for?
804 */
805 n += 19;
806 break;
807 default:
808 /* if we stumble upon an unknown
809 formatting code, copy the rest of
810 the format string to the output
811 string. (we cannot just skip the
812 code, since there's no way to know
813 what's in the argument list) */
814 n += strlen(p);
815 goto expand;
816 }
817 } else
818 n++;
819 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000820 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000821 if (abuffersize > 20) {
822 abuffer = PyObject_Malloc(abuffersize);
823 if (!abuffer) {
824 PyErr_NoMemory();
825 goto fail;
826 }
827 realbuffer = abuffer;
828 }
829 else
830 realbuffer = buffer;
831 /* step 4: fill the buffer */
832 /* Since we've analyzed how much space we need for the worst case,
833 we don't have to resize the string.
834 There can be no errors beyond this point. */
835 string = PyUnicode_FromUnicode(NULL, n);
836 if (!string)
837 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000838
Benjamin Peterson857ce152009-01-31 16:29:18 +0000839 s = PyUnicode_AS_UNICODE(string);
840 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000841
Benjamin Peterson857ce152009-01-31 16:29:18 +0000842 for (f = format; *f; f++) {
843 if (*f == '%') {
844 const char* p = f++;
845 int longflag = 0;
846 int size_tflag = 0;
847 zeropad = (*f == '0');
848 /* parse the width.precision part */
849 width = 0;
850 while (isdigit((unsigned)*f))
851 width = (width*10) + *f++ - '0';
852 precision = 0;
853 if (*f == '.') {
854 f++;
855 while (isdigit((unsigned)*f))
856 precision = (precision*10) + *f++ - '0';
857 }
858 /* handle the long flag, but only for %ld and %lu.
859 others can be added when necessary. */
860 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
861 longflag = 1;
862 ++f;
863 }
864 /* handle the size_t flag. */
865 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
866 size_tflag = 1;
867 ++f;
868 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000869
Benjamin Peterson857ce152009-01-31 16:29:18 +0000870 switch (*f) {
871 case 'c':
872 *s++ = va_arg(vargs, int);
873 break;
874 case 'd':
875 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
876 if (longflag)
877 sprintf(realbuffer, fmt, va_arg(vargs, long));
878 else if (size_tflag)
879 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
880 else
881 sprintf(realbuffer, fmt, va_arg(vargs, int));
882 appendstring(realbuffer);
883 break;
884 case 'u':
885 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
886 if (longflag)
887 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
888 else if (size_tflag)
889 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
890 else
891 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
892 appendstring(realbuffer);
893 break;
894 case 'i':
895 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
896 sprintf(realbuffer, fmt, va_arg(vargs, int));
897 appendstring(realbuffer);
898 break;
899 case 'x':
900 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
901 sprintf(realbuffer, fmt, va_arg(vargs, int));
902 appendstring(realbuffer);
903 break;
904 case 's':
905 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000906 /* unused, since we already have the result */
907 (void) va_arg(vargs, char *);
908 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
909 PyUnicode_GET_SIZE(*callresult));
910 s += PyUnicode_GET_SIZE(*callresult);
911 /* We're done with the unicode()/repr() => forget it */
912 Py_DECREF(*callresult);
913 /* switch to next unicode()/repr() result */
914 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000915 break;
916 }
917 case 'U':
918 {
919 PyObject *obj = va_arg(vargs, PyObject *);
920 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
921 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
922 s += size;
923 break;
924 }
925 case 'V':
926 {
927 PyObject *obj = va_arg(vargs, PyObject *);
928 const char *str = va_arg(vargs, const char *);
929 if (obj) {
930 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
931 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
932 s += size;
933 } else {
934 appendstring(str);
935 }
936 break;
937 }
938 case 'S':
939 case 'R':
940 {
941 Py_UNICODE *ucopy;
942 Py_ssize_t usize;
943 Py_ssize_t upos;
944 /* unused, since we already have the result */
945 (void) va_arg(vargs, PyObject *);
946 ucopy = PyUnicode_AS_UNICODE(*callresult);
947 usize = PyUnicode_GET_SIZE(*callresult);
948 for (upos = 0; upos<usize;)
949 *s++ = ucopy[upos++];
950 /* We're done with the unicode()/repr() => forget it */
951 Py_DECREF(*callresult);
952 /* switch to next unicode()/repr() result */
953 ++callresult;
954 break;
955 }
956 case 'p':
957 sprintf(buffer, "%p", va_arg(vargs, void*));
958 /* %p is ill-defined: ensure leading 0x. */
959 if (buffer[1] == 'X')
960 buffer[1] = 'x';
961 else if (buffer[1] != 'x') {
962 memmove(buffer+2, buffer, strlen(buffer)+1);
963 buffer[0] = '0';
964 buffer[1] = 'x';
965 }
966 appendstring(buffer);
967 break;
968 case '%':
969 *s++ = '%';
970 break;
971 default:
972 appendstring(p);
973 goto end;
974 }
975 } else
976 *s++ = *f;
977 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000978
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000979 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000980 if (callresults)
981 PyObject_Free(callresults);
982 if (abuffer)
983 PyObject_Free(abuffer);
984 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
985 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000986 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000987 if (callresults) {
988 PyObject **callresult2 = callresults;
989 while (callresult2 < callresult) {
990 Py_DECREF(*callresult2);
991 ++callresult2;
992 }
993 PyObject_Free(callresults);
994 }
995 if (abuffer)
996 PyObject_Free(abuffer);
997 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000998}
999
1000#undef appendstring
1001
1002PyObject *
1003PyUnicode_FromFormat(const char *format, ...)
1004{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001005 PyObject* ret;
1006 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001007
1008#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001009 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001010#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001011 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001012#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001013 ret = PyUnicode_FromFormatV(format, vargs);
1014 va_end(vargs);
1015 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001016}
1017
Martin v. Löwis18e16552006-02-15 17:27:45 +00001018Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001019 wchar_t *w,
1020 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021{
1022 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001023 PyErr_BadInternalCall();
1024 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001026
1027 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001029 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001030
Guido van Rossumd57fd912000-03-10 22:53:23 +00001031#ifdef HAVE_USABLE_WCHAR_T
1032 memcpy(w, unicode->str, size * sizeof(wchar_t));
1033#else
1034 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001035 register Py_UNICODE *u;
1036 register Py_ssize_t i;
1037 u = PyUnicode_AS_UNICODE(unicode);
1038 for (i = size; i > 0; i--)
1039 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 }
1041#endif
1042
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001043 if (size > PyUnicode_GET_SIZE(unicode))
1044 return PyUnicode_GET_SIZE(unicode);
1045 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001046 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047}
1048
1049#endif
1050
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001051PyObject *PyUnicode_FromOrdinal(int ordinal)
1052{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001053 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001054
1055#ifdef Py_UNICODE_WIDE
1056 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001057 PyErr_SetString(PyExc_ValueError,
1058 "unichr() arg not in range(0x110000) "
1059 "(wide Python build)");
1060 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001061 }
1062#else
1063 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001064 PyErr_SetString(PyExc_ValueError,
1065 "unichr() arg not in range(0x10000) "
1066 "(narrow Python build)");
1067 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001068 }
1069#endif
1070
Hye-Shik Chang40574832004-04-06 07:24:51 +00001071 s[0] = (Py_UNICODE)ordinal;
1072 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001073}
1074
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075PyObject *PyUnicode_FromObject(register PyObject *obj)
1076{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001077 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001078 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001079 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001080 Py_INCREF(obj);
1081 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001082 }
1083 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001084 /* For a Unicode subtype that's not a Unicode object,
1085 return a true Unicode object with the same data. */
1086 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1087 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001089 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1090}
1091
1092PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001093 const char *encoding,
1094 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001095{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001096 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001097 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001098 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001099
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001101 PyErr_BadInternalCall();
1102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001104
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001105#if 0
1106 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001107 that no encodings is given and then redirect to
1108 PyObject_Unicode() which then applies the additional logic for
1109 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001110
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001111 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001112 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001113
1114 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001115 if (PyUnicode_Check(obj)) {
1116 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001117 PyErr_SetString(PyExc_TypeError,
1118 "decoding Unicode is not supported");
1119 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001120 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001121 return PyObject_Unicode(obj);
1122 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001123#else
1124 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001125 PyErr_SetString(PyExc_TypeError,
1126 "decoding Unicode is not supported");
1127 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001128 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001129#endif
1130
1131 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001132 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001133 s = PyString_AS_STRING(obj);
1134 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001135 }
Christian Heimes3497f942008-05-26 12:29:14 +00001136 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001137 /* Python 2.x specific */
1138 PyErr_Format(PyExc_TypeError,
1139 "decoding bytearray is not supported");
1140 return NULL;
1141 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001142 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001143 /* Overwrite the error message with something more useful in
1144 case of a TypeError. */
1145 if (PyErr_ExceptionMatches(PyExc_TypeError))
1146 PyErr_Format(PyExc_TypeError,
1147 "coercing to Unicode: need string or buffer, "
1148 "%.80s found",
1149 Py_TYPE(obj)->tp_name);
1150 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001151 }
Tim Petersced69f82003-09-16 20:30:58 +00001152
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001153 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001155 Py_INCREF(unicode_empty);
1156 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 }
Tim Petersced69f82003-09-16 20:30:58 +00001158 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001159 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001160
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001161 return v;
1162
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001163 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001164 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165}
1166
1167PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001168 Py_ssize_t size,
1169 const char *encoding,
1170 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171{
1172 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001173
1174 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001175 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001176
1177 /* Shortcuts for common default encodings */
1178 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001180 else if (strcmp(encoding, "latin-1") == 0)
1181 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001182#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1183 else if (strcmp(encoding, "mbcs") == 0)
1184 return PyUnicode_DecodeMBCS(s, size, errors);
1185#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001186 else if (strcmp(encoding, "ascii") == 0)
1187 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188
1189 /* Decode via the codec registry */
1190 buffer = PyBuffer_FromMemory((void *)s, size);
1191 if (buffer == NULL)
1192 goto onError;
1193 unicode = PyCodec_Decode(buffer, encoding, errors);
1194 if (unicode == NULL)
1195 goto onError;
1196 if (!PyUnicode_Check(unicode)) {
1197 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001198 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001199 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 Py_DECREF(unicode);
1201 goto onError;
1202 }
1203 Py_DECREF(buffer);
1204 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001205
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001206 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207 Py_XDECREF(buffer);
1208 return NULL;
1209}
1210
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001211PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1212 const char *encoding,
1213 const char *errors)
1214{
1215 PyObject *v;
1216
1217 if (!PyUnicode_Check(unicode)) {
1218 PyErr_BadArgument();
1219 goto onError;
1220 }
1221
1222 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001223 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001224
1225 /* Decode via the codec registry */
1226 v = PyCodec_Decode(unicode, encoding, errors);
1227 if (v == NULL)
1228 goto onError;
1229 return v;
1230
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001231 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001232 return NULL;
1233}
1234
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001236 Py_ssize_t size,
1237 const char *encoding,
1238 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239{
1240 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001241
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 unicode = PyUnicode_FromUnicode(s, size);
1243 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1246 Py_DECREF(unicode);
1247 return v;
1248}
1249
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001250PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1251 const char *encoding,
1252 const char *errors)
1253{
1254 PyObject *v;
1255
1256 if (!PyUnicode_Check(unicode)) {
1257 PyErr_BadArgument();
1258 goto onError;
1259 }
1260
1261 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001262 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001263
1264 /* Encode via the codec registry */
1265 v = PyCodec_Encode(unicode, encoding, errors);
1266 if (v == NULL)
1267 goto onError;
1268 return v;
1269
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001270 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001271 return NULL;
1272}
1273
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1275 const char *encoding,
1276 const char *errors)
1277{
1278 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001279
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 if (!PyUnicode_Check(unicode)) {
1281 PyErr_BadArgument();
1282 goto onError;
1283 }
Fred Drakee4315f52000-05-09 19:53:39 +00001284
Tim Petersced69f82003-09-16 20:30:58 +00001285 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001286 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001287
1288 /* Shortcuts for common default encodings */
1289 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001290 if (strcmp(encoding, "utf-8") == 0)
1291 return PyUnicode_AsUTF8String(unicode);
1292 else if (strcmp(encoding, "latin-1") == 0)
1293 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001294#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001295 else if (strcmp(encoding, "mbcs") == 0)
1296 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001297#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001298 else if (strcmp(encoding, "ascii") == 0)
1299 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301
1302 /* Encode via the codec registry */
1303 v = PyCodec_Encode(unicode, encoding, errors);
1304 if (v == NULL)
1305 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001306 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001308 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001309 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310 Py_DECREF(v);
1311 goto onError;
1312 }
1313 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001314
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001315 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 return NULL;
1317}
1318
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001319PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001320 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001321{
1322 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1323
1324 if (v)
1325 return v;
1326 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1327 if (v && errors == NULL)
1328 ((PyUnicodeObject *)unicode)->defenc = v;
1329 return v;
1330}
1331
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1333{
1334 if (!PyUnicode_Check(unicode)) {
1335 PyErr_BadArgument();
1336 goto onError;
1337 }
1338 return PyUnicode_AS_UNICODE(unicode);
1339
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001340 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 return NULL;
1342}
1343
Martin v. Löwis18e16552006-02-15 17:27:45 +00001344Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345{
1346 if (!PyUnicode_Check(unicode)) {
1347 PyErr_BadArgument();
1348 goto onError;
1349 }
1350 return PyUnicode_GET_SIZE(unicode);
1351
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001352 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353 return -1;
1354}
1355
Thomas Wouters78890102000-07-22 19:25:51 +00001356const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001357{
1358 return unicode_default_encoding;
1359}
1360
1361int PyUnicode_SetDefaultEncoding(const char *encoding)
1362{
1363 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001364
Fred Drakee4315f52000-05-09 19:53:39 +00001365 /* Make sure the encoding is valid. As side effect, this also
1366 loads the encoding into the codec registry cache. */
1367 v = _PyCodec_Lookup(encoding);
1368 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001369 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001370 Py_DECREF(v);
1371 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001372 encoding,
1373 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001374 return 0;
1375
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001376 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001377 return -1;
1378}
1379
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001380/* error handling callback helper:
1381 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001382 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001383 and adjust various state variables.
1384 return 0 on success, -1 on error
1385*/
1386
1387static
1388int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 const char *encoding, const char *reason,
1390 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1391 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1392 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001394 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001395
1396 PyObject *restuple = NULL;
1397 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001398 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1399 Py_ssize_t requiredsize;
1400 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001401 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001402 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001403 int res = -1;
1404
1405 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001406 *errorHandler = PyCodec_LookupError(errors);
1407 if (*errorHandler == NULL)
1408 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001409 }
1410
1411 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001412 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001413 encoding, input, insize, *startinpos, *endinpos, reason);
1414 if (*exceptionObject == NULL)
1415 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 }
1417 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001418 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1419 goto onError;
1420 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1421 goto onError;
1422 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1423 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001424 }
1425
1426 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1427 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001428 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001430 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001431 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001432 }
1433 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001434 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001436 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001437 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001438 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1439 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001441
1442 /* need more space? (at least enough for what we
1443 have+the replacement+the rest of the string (starting
1444 at the new input position), so we won't have to check space
1445 when there are no errors in the rest of the string) */
1446 repptr = PyUnicode_AS_UNICODE(repunicode);
1447 repsize = PyUnicode_GET_SIZE(repunicode);
1448 requiredsize = *outpos + repsize + insize-newpos;
1449 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001450 if (requiredsize<2*outsize)
1451 requiredsize = 2*outsize;
1452 if (_PyUnicode_Resize(output, requiredsize) < 0)
1453 goto onError;
1454 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001455 }
1456 *endinpos = newpos;
1457 *inptr = input + newpos;
1458 Py_UNICODE_COPY(*outptr, repptr, repsize);
1459 *outptr += repsize;
1460 *outpos += repsize;
1461 /* we made it! */
1462 res = 0;
1463
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001464 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001465 Py_XDECREF(restuple);
1466 return res;
1467}
1468
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001469/* --- UTF-7 Codec -------------------------------------------------------- */
1470
Antoine Pitrou653dece2009-05-04 18:32:32 +00001471/* See RFC2152 for details. We encode conservatively and decode liberally. */
1472
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * The test is spelled out as explicit ASCII ranges instead of calling
 * isalnum(): isalnum() depends on the current locale and is undefined
 * for arguments outside the unsigned char range (other than EOF), so it
 * could misclassify or misbehave on non-ASCII values.  Note that c is
 * evaluated more than once. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * (any c that is not a letter, digit or '+' yields 63) */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1500
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the 7-bit character
 * code, identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1534
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character code to classify; only 1..127 can be direct
 *   directO  - non-zero if Set O characters are written directly
 *   directWS - non-zero if whitespace characters are written directly
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions (e.g. "a || b") keep their meaning next to the &&
 * operators below.  Note that c may be evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001546
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001547PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001548 Py_ssize_t size,
1549 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001551 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1552}
1553
Antoine Pitrou653dece2009-05-04 18:32:32 +00001554/* The decoder. The only state we preserve is our read position,
1555 * i.e. how many characters we have consumed. So if we end in the
1556 * middle of a shift sequence we have to back off the read position
1557 * and the output to the beginning of the sequence, otherwise we lose
1558 * all the shift state (seen bits, number of bits seen, high
1559 * surrogate). */
1560
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001561PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001562 Py_ssize_t size,
1563 const char *errors,
1564 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001565{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001566 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001567 Py_ssize_t startinpos;
1568 Py_ssize_t endinpos;
1569 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001570 const char *e;
1571 PyUnicodeObject *unicode;
1572 Py_UNICODE *p;
1573 const char *errmsg = "";
1574 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001575 Py_UNICODE *shiftOutStart;
1576 unsigned int base64bits = 0;
1577 unsigned long base64buffer = 0;
1578 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001579 PyObject *errorHandler = NULL;
1580 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581
1582 unicode = _PyUnicode_New(size);
1583 if (!unicode)
1584 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001585 if (size == 0) {
1586 if (consumed)
1587 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001588 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001589 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001590
1591 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001592 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 e = s + size;
1594
1595 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001596 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001597
Antoine Pitrou653dece2009-05-04 18:32:32 +00001598 if (inShift) { /* in a base-64 section */
1599 if (IS_BASE64(ch)) { /* consume a base-64 character */
1600 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1601 base64bits += 6;
1602 s++;
1603 if (base64bits >= 16) {
1604 /* we have enough bits for a UTF-16 value */
1605 Py_UNICODE outCh = (Py_UNICODE)
1606 (base64buffer >> (base64bits-16));
1607 base64bits -= 16;
1608 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1609 if (surrogate) {
1610 /* expecting a second surrogate */
1611 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1612#ifdef Py_UNICODE_WIDE
1613 *p++ = (((surrogate & 0x3FF)<<10)
1614 | (outCh & 0x3FF)) + 0x10000;
1615#else
1616 *p++ = surrogate;
1617 *p++ = outCh;
1618#endif
1619 surrogate = 0;
1620 }
1621 else {
1622 surrogate = 0;
1623 errmsg = "second surrogate missing";
1624 goto utf7Error;
1625 }
1626 }
1627 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1628 /* first surrogate */
1629 surrogate = outCh;
1630 }
1631 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1632 errmsg = "unexpected second surrogate";
1633 goto utf7Error;
1634 }
1635 else {
1636 *p++ = outCh;
1637 }
1638 }
1639 }
1640 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001641 inShift = 0;
1642 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001643 if (surrogate) {
1644 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001645 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001646 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001647 if (base64bits > 0) { /* left-over bits */
1648 if (base64bits >= 6) {
1649 /* We've seen at least one base-64 character */
1650 errmsg = "partial character in shift sequence";
1651 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001653 else {
1654 /* Some bits remain; they should be zero */
1655 if (base64buffer != 0) {
1656 errmsg = "non-zero padding bits in shift sequence";
1657 goto utf7Error;
1658 }
1659 }
1660 }
1661 if (ch != '-') {
1662 /* '-' is absorbed; other terminating
1663 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001664 *p++ = ch;
1665 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666 }
1667 }
1668 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001669 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001670 s++; /* consume '+' */
1671 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001672 s++;
1673 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001674 }
1675 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001676 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001677 shiftOutStart = p;
1678 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 }
1680 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001681 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001682 *p++ = ch;
1683 s++;
1684 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001685 else {
1686 startinpos = s-starts;
1687 s++;
1688 errmsg = "unexpected special character";
1689 goto utf7Error;
1690 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001692utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001693 outpos = p-PyUnicode_AS_UNICODE(unicode);
1694 endinpos = s-starts;
1695 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001696 errors, &errorHandler,
1697 "utf7", errmsg,
1698 starts, size, &startinpos, &endinpos, &exc, &s,
1699 &unicode, &outpos, &p))
1700 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001701 }
1702
Antoine Pitrou653dece2009-05-04 18:32:32 +00001703 /* end of string */
1704
1705 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1706 /* if we're in an inconsistent state, that's an error */
1707 if (surrogate ||
1708 (base64bits >= 6) ||
1709 (base64bits > 0 && base64buffer != 0)) {
1710 outpos = p-PyUnicode_AS_UNICODE(unicode);
1711 endinpos = size;
1712 if (unicode_decode_call_errorhandler(
1713 errors, &errorHandler,
1714 "utf7", "unterminated shift sequence",
1715 starts, size, &startinpos, &endinpos, &exc, &s,
1716 &unicode, &outpos, &p))
1717 goto onError;
1718 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001719 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001720
1721 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001722 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001723 if (inShift) {
1724 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001725 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001726 }
1727 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001728 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001729 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001730 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001731
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001732 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001733 goto onError;
1734
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 Py_XDECREF(errorHandler);
1736 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001737 return (PyObject *)unicode;
1738
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001739 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001740 Py_XDECREF(errorHandler);
1741 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742 Py_DECREF(unicode);
1743 return NULL;
1744}
1745
1746
1747PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001748 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001749 int base64SetO,
1750 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001751 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001752{
1753 PyObject *v;
1754 /* It might be possible to tighten this worst case */
Antoine Pitrou653dece2009-05-04 18:32:32 +00001755 Py_ssize_t allocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001756 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001757 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001758 unsigned int base64bits = 0;
1759 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001760 char * out;
1761 char * start;
1762
Antoine Pitrou653dece2009-05-04 18:32:32 +00001763 if (allocated / 5 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001764 return PyErr_NoMemory();
1765
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001766 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001767 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001768
Antoine Pitrou653dece2009-05-04 18:32:32 +00001769 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001770 if (v == NULL)
1771 return NULL;
1772
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001773 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001774 for (;i < size; ++i) {
1775 Py_UNICODE ch = s[i];
1776
Antoine Pitrou653dece2009-05-04 18:32:32 +00001777 if (inShift) {
1778 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1779 /* shifting out */
1780 if (base64bits) { /* output remaining bits */
1781 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1782 base64buffer = 0;
1783 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001784 }
1785 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001786 /* Characters not in the BASE64 set implicitly unshift the sequence
1787 so no '-' is required, except if the character is itself a '-' */
1788 if (IS_BASE64(ch) || ch == '-') {
1789 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001790 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001791 *out++ = (char) ch;
1792 }
1793 else {
1794 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001795 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001796 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001797 else { /* not in a shift sequence */
1798 if (ch == '+') {
1799 *out++ = '+';
1800 *out++ = '-';
1801 }
1802 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1803 *out++ = (char) ch;
1804 }
1805 else {
1806 *out++ = '+';
1807 inShift = 1;
1808 goto encode_char;
1809 }
1810 }
1811 continue;
1812encode_char:
1813#ifdef Py_UNICODE_WIDE
1814 if (ch >= 0x10000) {
1815 /* code first surrogate */
1816 base64bits += 16;
1817 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1818 while (base64bits >= 6) {
1819 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1820 base64bits -= 6;
1821 }
1822 /* prepare second surrogate */
1823 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1824 }
1825#endif
1826 base64bits += 16;
1827 base64buffer = (base64buffer << 16) | ch;
1828 while (base64bits >= 6) {
1829 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1830 base64bits -= 6;
1831 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001832 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001833 if (base64bits)
1834 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1835 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001836 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001837
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001838 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001839 return v;
1840}
1841
/* The UTF-7 helper macros are private to the codec above. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848/* --- UTF-8 Codec -------------------------------------------------------- */
1849
/* Map a UTF-8 lead (prefix) byte to the length in bytes of the sequence
   it introduces.  Zero means the byte is not a legal prefix (0x80-0xBF,
   0xFE and 0xFF).  See RFC 2279 for details.  The table is only ever
   read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six, 0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1871
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001873 Py_ssize_t size,
1874 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875{
Walter Dörwald69652032004-09-07 20:24:22 +00001876 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1877}
1878
1879PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001880 Py_ssize_t size,
1881 const char *errors,
1882 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001883{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001884 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001885 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001886 Py_ssize_t startinpos;
1887 Py_ssize_t endinpos;
1888 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889 const char *e;
1890 PyUnicodeObject *unicode;
1891 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001892 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001893 PyObject *errorHandler = NULL;
1894 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895
1896 /* Note: size will always be longer than the resulting Unicode
1897 character count */
1898 unicode = _PyUnicode_New(size);
1899 if (!unicode)
1900 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001901 if (size == 0) {
1902 if (consumed)
1903 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001905 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906
1907 /* Unpack UTF-8 encoded data */
1908 p = unicode->str;
1909 e = s + size;
1910
1911 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001912 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913
1914 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001915 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001916 s++;
1917 continue;
1918 }
1919
1920 n = utf8_code_length[ch];
1921
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001922 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001923 if (consumed)
1924 break;
1925 else {
1926 errmsg = "unexpected end of data";
1927 startinpos = s-starts;
1928 endinpos = size;
1929 goto utf8Error;
1930 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001931 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932
1933 switch (n) {
1934
1935 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001936 errmsg = "unexpected code byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001937 startinpos = s-starts;
1938 endinpos = startinpos+1;
1939 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940
1941 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001942 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001943 startinpos = s-starts;
1944 endinpos = startinpos+1;
1945 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946
1947 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001948 if ((s[1] & 0xc0) != 0x80) {
1949 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001950 startinpos = s-starts;
1951 endinpos = startinpos+2;
1952 goto utf8Error;
1953 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001955 if (ch < 0x80) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001956 startinpos = s-starts;
1957 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001958 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001959 goto utf8Error;
1960 }
1961 else
1962 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963 break;
1964
1965 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001966 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001967 (s[2] & 0xc0) != 0x80) {
1968 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001969 startinpos = s-starts;
1970 endinpos = startinpos+3;
1971 goto utf8Error;
1972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001974 if (ch < 0x0800) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001975 /* Note: UTF-8 encodings of surrogates are considered
1976 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001977
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001978 XXX For wide builds (UCS-4) we should probably try
1979 to recombine the surrogates into a single code
1980 unit.
1981 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001982 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001983 startinpos = s-starts;
1984 endinpos = startinpos+3;
1985 goto utf8Error;
1986 }
1987 else
1988 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001989 break;
1990
1991 case 4:
1992 if ((s[1] & 0xc0) != 0x80 ||
1993 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001994 (s[3] & 0xc0) != 0x80) {
1995 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001996 startinpos = s-starts;
1997 endinpos = startinpos+4;
1998 goto utf8Error;
1999 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002000 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002001 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002002 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002003 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002004 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002005 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002006 UTF-16 */
2007 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002008 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002009 startinpos = s-starts;
2010 endinpos = startinpos+4;
2011 goto utf8Error;
2012 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002013#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002014 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002015#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002016 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002017
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002018 /* translate from 10000..10FFFF to 0..FFFF */
2019 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002020
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002021 /* high surrogate = top 10 bits added to D800 */
2022 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002023
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002024 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002025 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002026#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027 break;
2028
2029 default:
2030 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002031 errmsg = "unsupported Unicode code range";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002032 startinpos = s-starts;
2033 endinpos = startinpos+n;
2034 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 }
2036 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002037 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002038
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002039 utf8Error:
2040 outpos = p-PyUnicode_AS_UNICODE(unicode);
2041 if (unicode_decode_call_errorhandler(
2042 errors, &errorHandler,
2043 "utf8", errmsg,
2044 starts, size, &startinpos, &endinpos, &exc, &s,
2045 &unicode, &outpos, &p))
2046 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 }
Walter Dörwald69652032004-09-07 20:24:22 +00002048 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002049 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050
2051 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002052 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 goto onError;
2054
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002055 Py_XDECREF(errorHandler);
2056 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002057 return (PyObject *)unicode;
2058
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002059 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 Py_XDECREF(errorHandler);
2061 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 Py_DECREF(unicode);
2063 return NULL;
2064}
2065
Tim Peters602f7402002-04-27 18:03:26 +00002066/* Allocation strategy: if the string is short, convert into a stack buffer
2067 and allocate exactly as much space needed at the end. Else allocate the
2068 maximum possible needed (4 result bytes per Unicode character), and return
2069 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002070*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002071PyObject *
2072PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002073 Py_ssize_t size,
2074 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075{
Tim Peters602f7402002-04-27 18:03:26 +00002076#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002077
Martin v. Löwis18e16552006-02-15 17:27:45 +00002078 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002079 PyObject *v; /* result string object */
2080 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002081 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002082 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002083 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002084
Tim Peters602f7402002-04-27 18:03:26 +00002085 assert(s != NULL);
2086 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087
Tim Peters602f7402002-04-27 18:03:26 +00002088 if (size <= MAX_SHORT_UNICHARS) {
2089 /* Write into the stack buffer; nallocated can't overflow.
2090 * At the end, we'll allocate exactly as much heap space as it
2091 * turns out we need.
2092 */
2093 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2094 v = NULL; /* will allocate after we're done */
2095 p = stackbuf;
2096 }
2097 else {
2098 /* Overallocate on the heap, and give the excess back at the end. */
2099 nallocated = size * 4;
2100 if (nallocated / 4 != size) /* overflow! */
2101 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002102 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002103 if (v == NULL)
2104 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002105 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002106 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002107
Tim Peters602f7402002-04-27 18:03:26 +00002108 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002109 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002110
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002111 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002112 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002114
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002116 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002117 *p++ = (char)(0xc0 | (ch >> 6));
2118 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002119 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002120 else {
Tim Peters602f7402002-04-27 18:03:26 +00002121 /* Encode UCS2 Unicode ordinals */
2122 if (ch < 0x10000) {
2123 /* Special case: check for high surrogate */
2124 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2125 Py_UCS4 ch2 = s[i];
2126 /* Check for low surrogate and combine the two to
2127 form a UCS4 value */
2128 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002129 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002130 i++;
2131 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002132 }
Tim Peters602f7402002-04-27 18:03:26 +00002133 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002134 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002135 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002136 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2137 *p++ = (char)(0x80 | (ch & 0x3f));
2138 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002139 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002140 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002141 /* Encode UCS4 Unicode ordinals */
2142 *p++ = (char)(0xf0 | (ch >> 18));
2143 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2144 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2145 *p++ = (char)(0x80 | (ch & 0x3f));
2146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002148
Tim Peters602f7402002-04-27 18:03:26 +00002149 if (v == NULL) {
2150 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002151 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002152 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002153 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002154 }
2155 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002156 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002157 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002158 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002159 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002161 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002162
Tim Peters602f7402002-04-27 18:03:26 +00002163#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002164}
2165
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2167{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 if (!PyUnicode_Check(unicode)) {
2169 PyErr_BadArgument();
2170 return NULL;
2171 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002172 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002173 PyUnicode_GET_SIZE(unicode),
2174 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175}
2176
Walter Dörwald6e390802007-08-17 16:41:28 +00002177/* --- UTF-32 Codec ------------------------------------------------------- */
2178
2179PyObject *
2180PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002181 Py_ssize_t size,
2182 const char *errors,
2183 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002184{
2185 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2186}
2187
2188PyObject *
2189PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002190 Py_ssize_t size,
2191 const char *errors,
2192 int *byteorder,
2193 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002194{
2195 const char *starts = s;
2196 Py_ssize_t startinpos;
2197 Py_ssize_t endinpos;
2198 Py_ssize_t outpos;
2199 PyUnicodeObject *unicode;
2200 Py_UNICODE *p;
2201#ifndef Py_UNICODE_WIDE
2202 int i, pairs;
2203#else
2204 const int pairs = 0;
2205#endif
2206 const unsigned char *q, *e;
2207 int bo = 0; /* assume native ordering by default */
2208 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002209 /* Offsets from q for retrieving bytes in the right order. */
2210#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2211 int iorder[] = {0, 1, 2, 3};
2212#else
2213 int iorder[] = {3, 2, 1, 0};
2214#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002215 PyObject *errorHandler = NULL;
2216 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002217 /* On narrow builds we split characters outside the BMP into two
2218 codepoints => count how much extra space we need. */
2219#ifndef Py_UNICODE_WIDE
2220 for (i = pairs = 0; i < size/4; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002221 if (((Py_UCS4 *)s)[i] >= 0x10000)
2222 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002223#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002224
2225 /* This might be one to much, because of a BOM */
2226 unicode = _PyUnicode_New((size+3)/4+pairs);
2227 if (!unicode)
2228 return NULL;
2229 if (size == 0)
2230 return (PyObject *)unicode;
2231
2232 /* Unpack UTF-32 encoded data */
2233 p = unicode->str;
2234 q = (unsigned char *)s;
2235 e = q + size;
2236
2237 if (byteorder)
2238 bo = *byteorder;
2239
2240 /* Check for BOM marks (U+FEFF) in the input and adjust current
2241 byte order setting accordingly. In native mode, the leading BOM
2242 mark is skipped, in all other modes, it is copied to the output
2243 stream as-is (giving a ZWNBSP character). */
2244 if (bo == 0) {
2245 if (size >= 4) {
2246 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002247 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002248#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002249 if (bom == 0x0000FEFF) {
2250 q += 4;
2251 bo = -1;
2252 }
2253 else if (bom == 0xFFFE0000) {
2254 q += 4;
2255 bo = 1;
2256 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002257#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002258 if (bom == 0x0000FEFF) {
2259 q += 4;
2260 bo = 1;
2261 }
2262 else if (bom == 0xFFFE0000) {
2263 q += 4;
2264 bo = -1;
2265 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002266#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002267 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002268 }
2269
2270 if (bo == -1) {
2271 /* force LE */
2272 iorder[0] = 0;
2273 iorder[1] = 1;
2274 iorder[2] = 2;
2275 iorder[3] = 3;
2276 }
2277 else if (bo == 1) {
2278 /* force BE */
2279 iorder[0] = 3;
2280 iorder[1] = 2;
2281 iorder[2] = 1;
2282 iorder[3] = 0;
2283 }
2284
2285 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002286 Py_UCS4 ch;
2287 /* remaining bytes at the end? (size should be divisible by 4) */
2288 if (e-q<4) {
2289 if (consumed)
2290 break;
2291 errmsg = "truncated data";
2292 startinpos = ((const char *)q)-starts;
2293 endinpos = ((const char *)e)-starts;
2294 goto utf32Error;
2295 /* The remaining input chars are ignored if the callback
2296 chooses to skip the input */
2297 }
2298 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2299 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002300
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002301 if (ch >= 0x110000)
2302 {
2303 errmsg = "codepoint not in range(0x110000)";
2304 startinpos = ((const char *)q)-starts;
2305 endinpos = startinpos+4;
2306 goto utf32Error;
2307 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002308#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002309 if (ch >= 0x10000)
2310 {
2311 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2312 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2313 }
2314 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002315#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002316 *p++ = ch;
2317 q += 4;
2318 continue;
2319 utf32Error:
2320 outpos = p-PyUnicode_AS_UNICODE(unicode);
2321 if (unicode_decode_call_errorhandler(
2322 errors, &errorHandler,
2323 "utf32", errmsg,
2324 starts, size, &startinpos, &endinpos, &exc, &s,
2325 &unicode, &outpos, &p))
2326 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002327 }
2328
2329 if (byteorder)
2330 *byteorder = bo;
2331
2332 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002333 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002334
2335 /* Adjust length */
2336 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2337 goto onError;
2338
2339 Py_XDECREF(errorHandler);
2340 Py_XDECREF(exc);
2341 return (PyObject *)unicode;
2342
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002343 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002344 Py_DECREF(unicode);
2345 Py_XDECREF(errorHandler);
2346 Py_XDECREF(exc);
2347 return NULL;
2348}
2349
2350PyObject *
2351PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002352 Py_ssize_t size,
2353 const char *errors,
2354 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002355{
2356 PyObject *v;
2357 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002358 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002359#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002360 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002361#else
2362 const int pairs = 0;
2363#endif
2364 /* Offsets from p for storing byte pairs in the right order. */
2365#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2366 int iorder[] = {0, 1, 2, 3};
2367#else
2368 int iorder[] = {3, 2, 1, 0};
2369#endif
2370
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002371#define STORECHAR(CH) \
2372 do { \
2373 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2374 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2375 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2376 p[iorder[0]] = (CH) & 0xff; \
2377 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002378 } while(0)
2379
2380 /* In narrow builds we can output surrogate pairs as one codepoint,
2381 so we need less space. */
2382#ifndef Py_UNICODE_WIDE
2383 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002384 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2385 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2386 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002387#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002388 nsize = (size - pairs + (byteorder == 0));
2389 bytesize = nsize * 4;
2390 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002391 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002392 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002393 if (v == NULL)
2394 return NULL;
2395
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002396 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002397 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002398 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002399 if (size == 0)
2400 return v;
2401
2402 if (byteorder == -1) {
2403 /* force LE */
2404 iorder[0] = 0;
2405 iorder[1] = 1;
2406 iorder[2] = 2;
2407 iorder[3] = 3;
2408 }
2409 else if (byteorder == 1) {
2410 /* force BE */
2411 iorder[0] = 3;
2412 iorder[1] = 2;
2413 iorder[2] = 1;
2414 iorder[3] = 0;
2415 }
2416
2417 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002418 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002419#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002420 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2421 Py_UCS4 ch2 = *s;
2422 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2423 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2424 s++;
2425 size--;
2426 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002427 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002428#endif
2429 STORECHAR(ch);
2430 }
2431 return v;
2432#undef STORECHAR
2433}
2434
2435PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2436{
2437 if (!PyUnicode_Check(unicode)) {
2438 PyErr_BadArgument();
2439 return NULL;
2440 }
2441 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002442 PyUnicode_GET_SIZE(unicode),
2443 NULL,
2444 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002445}
2446
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447/* --- UTF-16 Codec ------------------------------------------------------- */
2448
Tim Peters772747b2001-08-09 22:21:55 +00002449PyObject *
2450PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002451 Py_ssize_t size,
2452 const char *errors,
2453 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002454{
Walter Dörwald69652032004-09-07 20:24:22 +00002455 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2456}
2457
2458PyObject *
2459PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002460 Py_ssize_t size,
2461 const char *errors,
2462 int *byteorder,
2463 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002464{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002465 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002466 Py_ssize_t startinpos;
2467 Py_ssize_t endinpos;
2468 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002469 PyUnicodeObject *unicode;
2470 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002471 const unsigned char *q, *e;
2472 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002473 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002474 /* Offsets from q for retrieving byte pairs in the right order. */
2475#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2476 int ihi = 1, ilo = 0;
2477#else
2478 int ihi = 0, ilo = 1;
2479#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002480 PyObject *errorHandler = NULL;
2481 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482
2483 /* Note: size will always be longer than the resulting Unicode
2484 character count */
2485 unicode = _PyUnicode_New(size);
2486 if (!unicode)
2487 return NULL;
2488 if (size == 0)
2489 return (PyObject *)unicode;
2490
2491 /* Unpack UTF-16 encoded data */
2492 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002493 q = (unsigned char *)s;
2494 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002495
2496 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002497 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002498
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002499 /* Check for BOM marks (U+FEFF) in the input and adjust current
2500 byte order setting accordingly. In native mode, the leading BOM
2501 mark is skipped, in all other modes, it is copied to the output
2502 stream as-is (giving a ZWNBSP character). */
2503 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002504 if (size >= 2) {
2505 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002506#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002507 if (bom == 0xFEFF) {
2508 q += 2;
2509 bo = -1;
2510 }
2511 else if (bom == 0xFFFE) {
2512 q += 2;
2513 bo = 1;
2514 }
Tim Petersced69f82003-09-16 20:30:58 +00002515#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002516 if (bom == 0xFEFF) {
2517 q += 2;
2518 bo = 1;
2519 }
2520 else if (bom == 0xFFFE) {
2521 q += 2;
2522 bo = -1;
2523 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002524#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002525 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002526 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527
Tim Peters772747b2001-08-09 22:21:55 +00002528 if (bo == -1) {
2529 /* force LE */
2530 ihi = 1;
2531 ilo = 0;
2532 }
2533 else if (bo == 1) {
2534 /* force BE */
2535 ihi = 0;
2536 ilo = 1;
2537 }
2538
2539 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002540 Py_UNICODE ch;
2541 /* remaining bytes at the end? (size should be even) */
2542 if (e-q<2) {
2543 if (consumed)
2544 break;
2545 errmsg = "truncated data";
2546 startinpos = ((const char *)q)-starts;
2547 endinpos = ((const char *)e)-starts;
2548 goto utf16Error;
2549 /* The remaining input chars are ignored if the callback
2550 chooses to skip the input */
2551 }
2552 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002553
Benjamin Peterson857ce152009-01-31 16:29:18 +00002554 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002555
2556 if (ch < 0xD800 || ch > 0xDFFF) {
2557 *p++ = ch;
2558 continue;
2559 }
2560
2561 /* UTF-16 code pair: */
2562 if (q >= e) {
2563 errmsg = "unexpected end of data";
2564 startinpos = (((const char *)q)-2)-starts;
2565 endinpos = ((const char *)e)-starts;
2566 goto utf16Error;
2567 }
2568 if (0xD800 <= ch && ch <= 0xDBFF) {
2569 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2570 q += 2;
2571 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002572#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002573 *p++ = ch;
2574 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002575#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002576 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002577#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002578 continue;
2579 }
2580 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002581 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002582 startinpos = (((const char *)q)-4)-starts;
2583 endinpos = startinpos+2;
2584 goto utf16Error;
2585 }
2586
Benjamin Peterson857ce152009-01-31 16:29:18 +00002587 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002588 errmsg = "illegal encoding";
2589 startinpos = (((const char *)q)-2)-starts;
2590 endinpos = startinpos+2;
2591 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002592
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002593 utf16Error:
2594 outpos = p-PyUnicode_AS_UNICODE(unicode);
2595 if (unicode_decode_call_errorhandler(
2596 errors, &errorHandler,
2597 "utf16", errmsg,
2598 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2599 &unicode, &outpos, &p))
2600 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 }
2602
2603 if (byteorder)
2604 *byteorder = bo;
2605
Walter Dörwald69652032004-09-07 20:24:22 +00002606 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002607 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002608
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002610 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611 goto onError;
2612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002613 Py_XDECREF(errorHandler);
2614 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 return (PyObject *)unicode;
2616
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002617 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002619 Py_XDECREF(errorHandler);
2620 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 return NULL;
2622}
2623
Tim Peters772747b2001-08-09 22:21:55 +00002624PyObject *
2625PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002626 Py_ssize_t size,
2627 const char *errors,
2628 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629{
2630 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002631 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002632 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002633#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002634 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002635#else
2636 const int pairs = 0;
2637#endif
Tim Peters772747b2001-08-09 22:21:55 +00002638 /* Offsets from p for storing byte pairs in the right order. */
2639#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2640 int ihi = 1, ilo = 0;
2641#else
2642 int ihi = 0, ilo = 1;
2643#endif
2644
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002645#define STORECHAR(CH) \
2646 do { \
2647 p[ihi] = ((CH) >> 8) & 0xff; \
2648 p[ilo] = (CH) & 0xff; \
2649 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002650 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002652#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002653 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002654 if (s[i] >= 0x10000)
2655 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002656#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002657 /* 2 * (size + pairs + (byteorder == 0)) */
2658 if (size > PY_SSIZE_T_MAX ||
2659 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002660 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002661 nsize = size + pairs + (byteorder == 0);
2662 bytesize = nsize * 2;
2663 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002664 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002665 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 if (v == NULL)
2667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002669 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002671 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002672 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002673 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002674
2675 if (byteorder == -1) {
2676 /* force LE */
2677 ihi = 1;
2678 ilo = 0;
2679 }
2680 else if (byteorder == 1) {
2681 /* force BE */
2682 ihi = 0;
2683 ilo = 1;
2684 }
2685
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002686 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002687 Py_UNICODE ch = *s++;
2688 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002689#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002690 if (ch >= 0x10000) {
2691 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2692 ch = 0xD800 | ((ch-0x10000) >> 10);
2693 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002694#endif
Tim Peters772747b2001-08-09 22:21:55 +00002695 STORECHAR(ch);
2696 if (ch2)
2697 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002700#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701}
2702
2703PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2704{
2705 if (!PyUnicode_Check(unicode)) {
2706 PyErr_BadArgument();
2707 return NULL;
2708 }
2709 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002710 PyUnicode_GET_SIZE(unicode),
2711 NULL,
2712 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713}
2714
2715/* --- Unicode Escape Codec ----------------------------------------------- */
2716
Fredrik Lundh06d12682001-01-24 07:59:11 +00002717static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002718
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002720 Py_ssize_t size,
2721 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002722{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002724 Py_ssize_t startinpos;
2725 Py_ssize_t endinpos;
2726 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002731 char* message;
2732 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 PyObject *errorHandler = NULL;
2734 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002735
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 /* Escaped strings will always be longer than the resulting
2737 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 length after conversion to the true value.
2739 (but if the error callback returns a long replacement string
2740 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 v = _PyUnicode_New(size);
2742 if (v == NULL)
2743 goto onError;
2744 if (size == 0)
2745 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002746
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002747 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002748 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002749
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750 while (s < end) {
2751 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002752 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002753 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002754
2755 /* Non-escape characters are interpreted as Unicode ordinals */
2756 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002757 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 continue;
2759 }
2760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002761 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 /* \ - Escapes */
2763 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002764 c = *s++;
2765 if (s > end)
2766 c = '\0'; /* Invalid after \ */
2767 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002769 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 case '\n': break;
2771 case '\\': *p++ = '\\'; break;
2772 case '\'': *p++ = '\''; break;
2773 case '\"': *p++ = '\"'; break;
2774 case 'b': *p++ = '\b'; break;
2775 case 'f': *p++ = '\014'; break; /* FF */
2776 case 't': *p++ = '\t'; break;
2777 case 'n': *p++ = '\n'; break;
2778 case 'r': *p++ = '\r'; break;
2779 case 'v': *p++ = '\013'; break; /* VT */
2780 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2781
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002782 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783 case '0': case '1': case '2': case '3':
2784 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002785 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002786 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002787 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002788 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002789 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002791 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 break;
2793
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002794 /* hex escapes */
2795 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002797 digits = 2;
2798 message = "truncated \\xXX escape";
2799 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002801 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002803 digits = 4;
2804 message = "truncated \\uXXXX escape";
2805 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002807 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002808 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002809 digits = 8;
2810 message = "truncated \\UXXXXXXXX escape";
2811 hexescape:
2812 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002813 outpos = p-PyUnicode_AS_UNICODE(v);
2814 if (s+digits>end) {
2815 endinpos = size;
2816 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002817 errors, &errorHandler,
2818 "unicodeescape", "end of string in escape sequence",
2819 starts, size, &startinpos, &endinpos, &exc, &s,
2820 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 goto onError;
2822 goto nextByte;
2823 }
2824 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002825 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002826 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002827 endinpos = (s+i+1)-starts;
2828 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002829 errors, &errorHandler,
2830 "unicodeescape", message,
2831 starts, size, &startinpos, &endinpos, &exc, &s,
2832 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002833 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002835 }
2836 chr = (chr<<4) & ~0xF;
2837 if (c >= '0' && c <= '9')
2838 chr += c - '0';
2839 else if (c >= 'a' && c <= 'f')
2840 chr += 10 + c - 'a';
2841 else
2842 chr += 10 + c - 'A';
2843 }
2844 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002845 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002846 /* _decoding_error will have already written into the
2847 target buffer. */
2848 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002849 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002850 /* when we get here, chr is a 32-bit unicode character */
2851 if (chr <= 0xffff)
2852 /* UCS-2 character */
2853 *p++ = (Py_UNICODE) chr;
2854 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002855 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002856 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002857#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002858 *p++ = chr;
2859#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002860 chr -= 0x10000L;
2861 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002862 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002863#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002864 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002865 endinpos = s-starts;
2866 outpos = p-PyUnicode_AS_UNICODE(v);
2867 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002868 errors, &errorHandler,
2869 "unicodeescape", "illegal Unicode character",
2870 starts, size, &startinpos, &endinpos, &exc, &s,
2871 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002872 goto onError;
2873 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002874 break;
2875
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002876 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002877 case 'N':
2878 message = "malformed \\N character escape";
2879 if (ucnhash_CAPI == NULL) {
2880 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002881 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002882 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002883 if (m == NULL)
2884 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002885 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002886 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002887 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002888 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002889 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002890 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002891 if (ucnhash_CAPI == NULL)
2892 goto ucnhashError;
2893 }
2894 if (*s == '{') {
2895 const char *start = s+1;
2896 /* look for the closing brace */
2897 while (*s != '}' && s < end)
2898 s++;
2899 if (s > start && s < end && *s == '}') {
2900 /* found a name. look it up in the unicode database */
2901 message = "unknown Unicode character name";
2902 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002903 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002904 goto store;
2905 }
2906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 endinpos = s-starts;
2908 outpos = p-PyUnicode_AS_UNICODE(v);
2909 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002910 errors, &errorHandler,
2911 "unicodeescape", message,
2912 starts, size, &startinpos, &endinpos, &exc, &s,
2913 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002914 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002915 break;
2916
2917 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002918 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002919 message = "\\ at end of string";
2920 s--;
2921 endinpos = s-starts;
2922 outpos = p-PyUnicode_AS_UNICODE(v);
2923 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002924 errors, &errorHandler,
2925 "unicodeescape", message,
2926 starts, size, &startinpos, &endinpos, &exc, &s,
2927 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002928 goto onError;
2929 }
2930 else {
2931 *p++ = '\\';
2932 *p++ = (unsigned char)s[-1];
2933 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002934 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002936 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002939 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002941 Py_XDECREF(errorHandler);
2942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002944
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002945 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002946 PyErr_SetString(
2947 PyExc_UnicodeError,
2948 "\\N escapes not supported (can't load unicodedata module)"
2949 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002950 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 Py_XDECREF(errorHandler);
2952 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002953 return NULL;
2954
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002955 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959 return NULL;
2960}
2961
2962/* Return a Unicode-Escape string version of the Unicode object.
2963
2964 If quotes is true, the string is enclosed in u"" or u'' quotes as
2965 appropriate.
2966
2967*/
2968
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002969Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002970 Py_ssize_t size,
2971 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002972{
2973 /* like wcschr, but doesn't stop at NULL characters */
2974
2975 while (size-- > 0) {
2976 if (*s == ch)
2977 return s;
2978 s++;
2979 }
2980
2981 return NULL;
2982}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002983
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984static
2985PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002986 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 int quotes)
2988{
2989 PyObject *repr;
2990 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002992 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002993#ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize = 10;
2995#else
2996 const Py_ssize_t expandsize = 6;
2997#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998
Neal Norwitz17753ec2006-08-21 22:21:19 +00002999 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size.
3002 */
3003 /* Initial allocation is based on the longest-possible unichr
3004 escape.
3005
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape.
3011
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr
3014 escape.
3015 */
3016
Neal Norwitze7d8be82008-07-31 17:17:14 +00003017 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003018 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003019
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003020 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003021 2
3022 + expandsize*size
3023 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 if (repr == NULL)
3025 return NULL;
3026
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003027 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003031 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 !findchar(s, size, '"')) ? '"' : '\'';
3033 }
3034 while (size-- > 0) {
3035 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003036
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003037 /* Escape quotes and backslashes */
3038 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003039 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 *p++ = '\\';
3041 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003042 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003043 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003044
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003045#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003046 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch >= 0x10000) {
3048 *p++ = '\\';
3049 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003050 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003057 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003058 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003059 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003060#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch >= 0xD800 && ch < 0xDC00) {
3063 Py_UNICODE ch2;
3064 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003065
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003066 ch2 = *s++;
3067 size--;
3068 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070 *p++ = '\\';
3071 *p++ = 'U';
3072 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079 *p++ = hexdigit[ucs & 0x0000000F];
3080 continue;
3081 }
3082 /* Fall through: isolated surrogates are copied as-is */
3083 s--;
3084 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003085 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003086#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003087
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003089 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 *p++ = '\\';
3091 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003092 *p++ = hexdigit[(ch >> 12) & 0x000F];
3093 *p++ = hexdigit[(ch >> 8) & 0x000F];
3094 *p++ = hexdigit[(ch >> 4) & 0x000F];
3095 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003097
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003098 /* Map special whitespace to '\t', \n', '\r' */
3099 else if (ch == '\t') {
3100 *p++ = '\\';
3101 *p++ = 't';
3102 }
3103 else if (ch == '\n') {
3104 *p++ = '\\';
3105 *p++ = 'n';
3106 }
3107 else if (ch == '\r') {
3108 *p++ = '\\';
3109 *p++ = 'r';
3110 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003111
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003112 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003113 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003115 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003116 *p++ = hexdigit[(ch >> 4) & 0x000F];
3117 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003118 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003119
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 /* Copy everything else as-is */
3121 else
3122 *p++ = (char) ch;
3123 }
3124 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003125 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126
3127 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003128 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 return repr;
3130}
3131
3132PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003133 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134{
3135 return unicodeescape_string(s, size, 0);
3136}
3137
3138PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3139{
3140 if (!PyUnicode_Check(unicode)) {
3141 PyErr_BadArgument();
3142 return NULL;
3143 }
3144 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003145 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146}
3147
3148/* --- Raw Unicode Escape Codec ------------------------------------------- */
3149
3150PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003151 Py_ssize_t size,
3152 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003155 Py_ssize_t startinpos;
3156 Py_ssize_t endinpos;
3157 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 const char *end;
3161 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003162 PyObject *errorHandler = NULL;
3163 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003164
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 /* Escaped strings will always be longer than the resulting
3166 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003167 length after conversion to the true value. (But decoding error
3168 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 v = _PyUnicode_New(size);
3170 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003171 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003173 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003174 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 end = s + size;
3176 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003177 unsigned char c;
3178 Py_UCS4 x;
3179 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003180 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003182 /* Non-escape characters are interpreted as Unicode ordinals */
3183 if (*s != '\\') {
3184 *p++ = (unsigned char)*s++;
3185 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003186 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003187 startinpos = s-starts;
3188
3189 /* \u-escapes are only interpreted iff the number of leading
3190 backslashes if odd */
3191 bs = s;
3192 for (;s < end;) {
3193 if (*s != '\\')
3194 break;
3195 *p++ = (unsigned char)*s++;
3196 }
3197 if (((s - bs) & 1) == 0 ||
3198 s >= end ||
3199 (*s != 'u' && *s != 'U')) {
3200 continue;
3201 }
3202 p--;
3203 count = *s=='u' ? 4 : 8;
3204 s++;
3205
3206 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3207 outpos = p-PyUnicode_AS_UNICODE(v);
3208 for (x = 0, i = 0; i < count; ++i, ++s) {
3209 c = (unsigned char)*s;
3210 if (!isxdigit(c)) {
3211 endinpos = s-starts;
3212 if (unicode_decode_call_errorhandler(
3213 errors, &errorHandler,
3214 "rawunicodeescape", "truncated \\uXXXX",
3215 starts, size, &startinpos, &endinpos, &exc, &s,
3216 &v, &outpos, &p))
3217 goto onError;
3218 goto nextByte;
3219 }
3220 x = (x<<4) & ~0xF;
3221 if (c >= '0' && c <= '9')
3222 x += c - '0';
3223 else if (c >= 'a' && c <= 'f')
3224 x += 10 + c - 'a';
3225 else
3226 x += 10 + c - 'A';
3227 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003228 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003229 /* UCS-2 character */
3230 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003231 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003232 /* UCS-4 character. Either store directly, or as
3233 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003234#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003235 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003236#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003237 x -= 0x10000L;
3238 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3239 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003240#endif
3241 } else {
3242 endinpos = s-starts;
3243 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003244 if (unicode_decode_call_errorhandler(
3245 errors, &errorHandler,
3246 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003247 starts, size, &startinpos, &endinpos, &exc, &s,
3248 &v, &outpos, &p))
3249 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003250 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003251 nextByte:
3252 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003254 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003255 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 Py_XDECREF(errorHandler);
3257 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003259
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003260 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 Py_XDECREF(errorHandler);
3263 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 return NULL;
3265}
3266
3267PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003268 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269{
3270 PyObject *repr;
3271 char *p;
3272 char *q;
3273
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003274 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003275#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003276 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003277#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003278 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003279#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003280
Neal Norwitze7d8be82008-07-31 17:17:14 +00003281 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003282 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003283
Neal Norwitze7d8be82008-07-31 17:17:14 +00003284 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 if (repr == NULL)
3286 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003287 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003288 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003290 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 while (size-- > 0) {
3292 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003293#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003294 /* Map 32-bit characters to '\Uxxxxxxxx' */
3295 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003296 *p++ = '\\';
3297 *p++ = 'U';
3298 *p++ = hexdigit[(ch >> 28) & 0xf];
3299 *p++ = hexdigit[(ch >> 24) & 0xf];
3300 *p++ = hexdigit[(ch >> 20) & 0xf];
3301 *p++ = hexdigit[(ch >> 16) & 0xf];
3302 *p++ = hexdigit[(ch >> 12) & 0xf];
3303 *p++ = hexdigit[(ch >> 8) & 0xf];
3304 *p++ = hexdigit[(ch >> 4) & 0xf];
3305 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003306 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003307 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003308#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003309 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310 if (ch >= 0xD800 && ch < 0xDC00) {
3311 Py_UNICODE ch2;
3312 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003313
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003314 ch2 = *s++;
3315 size--;
3316 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3317 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3318 *p++ = '\\';
3319 *p++ = 'U';
3320 *p++ = hexdigit[(ucs >> 28) & 0xf];
3321 *p++ = hexdigit[(ucs >> 24) & 0xf];
3322 *p++ = hexdigit[(ucs >> 20) & 0xf];
3323 *p++ = hexdigit[(ucs >> 16) & 0xf];
3324 *p++ = hexdigit[(ucs >> 12) & 0xf];
3325 *p++ = hexdigit[(ucs >> 8) & 0xf];
3326 *p++ = hexdigit[(ucs >> 4) & 0xf];
3327 *p++ = hexdigit[ucs & 0xf];
3328 continue;
3329 }
3330 /* Fall through: isolated surrogates are copied as-is */
3331 s--;
3332 size++;
3333 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003334#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003335 /* Map 16-bit characters to '\uxxxx' */
3336 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 *p++ = '\\';
3338 *p++ = 'u';
3339 *p++ = hexdigit[(ch >> 12) & 0xf];
3340 *p++ = hexdigit[(ch >> 8) & 0xf];
3341 *p++ = hexdigit[(ch >> 4) & 0xf];
3342 *p++ = hexdigit[ch & 15];
3343 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003344 /* Copy everything else as-is */
3345 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 *p++ = (char) ch;
3347 }
3348 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003349 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 return repr;
3351}
3352
3353PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3354{
3355 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003356 PyErr_BadArgument();
3357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 }
3359 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003360 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361}
3362
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003363/* --- Unicode Internal Codec ------------------------------------------- */
3364
3365PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003366 Py_ssize_t size,
3367 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003368{
3369 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003370 Py_ssize_t startinpos;
3371 Py_ssize_t endinpos;
3372 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003373 PyUnicodeObject *v;
3374 Py_UNICODE *p;
3375 const char *end;
3376 const char *reason;
3377 PyObject *errorHandler = NULL;
3378 PyObject *exc = NULL;
3379
Neal Norwitzd43069c2006-01-08 01:12:10 +00003380#ifdef Py_UNICODE_WIDE
3381 Py_UNICODE unimax = PyUnicode_GetMax();
3382#endif
3383
Armin Rigo7ccbca92006-10-04 12:17:45 +00003384 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003385 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3386 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003387 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003388 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003389 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003390 p = PyUnicode_AS_UNICODE(v);
3391 end = s + size;
3392
3393 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003394 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003395 /* We have to sanity check the raw data, otherwise doom looms for
3396 some malformed UCS-4 data. */
3397 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003398#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003399 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003400#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003401 end-s < Py_UNICODE_SIZE
3402 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003403 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003404 startinpos = s - starts;
3405 if (end-s < Py_UNICODE_SIZE) {
3406 endinpos = end-starts;
3407 reason = "truncated input";
3408 }
3409 else {
3410 endinpos = s - starts + Py_UNICODE_SIZE;
3411 reason = "illegal code point (> 0x10FFFF)";
3412 }
3413 outpos = p - PyUnicode_AS_UNICODE(v);
3414 if (unicode_decode_call_errorhandler(
3415 errors, &errorHandler,
3416 "unicode_internal", reason,
3417 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003418 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003419 goto onError;
3420 }
3421 }
3422 else {
3423 p++;
3424 s += Py_UNICODE_SIZE;
3425 }
3426 }
3427
Martin v. Löwis412fb672006-04-13 06:34:32 +00003428 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003429 goto onError;
3430 Py_XDECREF(errorHandler);
3431 Py_XDECREF(exc);
3432 return (PyObject *)v;
3433
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003434 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003435 Py_XDECREF(v);
3436 Py_XDECREF(errorHandler);
3437 Py_XDECREF(exc);
3438 return NULL;
3439}
3440
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441/* --- Latin-1 Codec ------------------------------------------------------ */
3442
3443PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003444 Py_ssize_t size,
3445 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446{
3447 PyUnicodeObject *v;
3448 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003449
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003451 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003452 Py_UNICODE r = *(unsigned char*)s;
3453 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003454 }
3455
Guido van Rossumd57fd912000-03-10 22:53:23 +00003456 v = _PyUnicode_New(size);
3457 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003458 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003460 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 p = PyUnicode_AS_UNICODE(v);
3462 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003463 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003465
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003466 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 Py_XDECREF(v);
3468 return NULL;
3469}
3470
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003471/* create or adjust a UnicodeEncodeError */
3472static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003473 const char *encoding,
3474 const Py_UNICODE *unicode, Py_ssize_t size,
3475 Py_ssize_t startpos, Py_ssize_t endpos,
3476 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003477{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003478 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003479 *exceptionObject = PyUnicodeEncodeError_Create(
3480 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481 }
3482 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003483 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3484 goto onError;
3485 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3486 goto onError;
3487 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3488 goto onError;
3489 return;
3490 onError:
3491 Py_DECREF(*exceptionObject);
3492 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 }
3494}
3495
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003496/* raises a UnicodeEncodeError */
3497static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003498 const char *encoding,
3499 const Py_UNICODE *unicode, Py_ssize_t size,
3500 Py_ssize_t startpos, Py_ssize_t endpos,
3501 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502{
3503 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003504 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003506 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507}
3508
3509/* error handling callback helper:
3510 build arguments, call the callback and check the arguments,
3511 put the result into newpos and return the replacement string, which
3512 has to be freed by the caller */
3513static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003514 PyObject **errorHandler,
3515 const char *encoding, const char *reason,
3516 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3517 Py_ssize_t startpos, Py_ssize_t endpos,
3518 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003520 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521
3522 PyObject *restuple;
3523 PyObject *resunicode;
3524
3525 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003526 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003528 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 }
3530
3531 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003532 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003534 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535
3536 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003537 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003539 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003541 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003542 Py_DECREF(restuple);
3543 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 }
3545 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003546 &resunicode, newpos)) {
3547 Py_DECREF(restuple);
3548 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 }
3550 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003551 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003552 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003553 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3554 Py_DECREF(restuple);
3555 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003556 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003557 Py_INCREF(resunicode);
3558 Py_DECREF(restuple);
3559 return resunicode;
3560}
3561
3562static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003563 Py_ssize_t size,
3564 const char *errors,
3565 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003566{
3567 /* output object */
3568 PyObject *res;
3569 /* pointers to the beginning and end+1 of input */
3570 const Py_UNICODE *startp = p;
3571 const Py_UNICODE *endp = p + size;
3572 /* pointer to the beginning of the unencodable characters */
3573 /* const Py_UNICODE *badp = NULL; */
3574 /* pointer into the output */
3575 char *str;
3576 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003577 Py_ssize_t respos = 0;
3578 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003579 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3580 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 PyObject *errorHandler = NULL;
3582 PyObject *exc = NULL;
3583 /* the following variable is used for caching string comparisons
3584 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3585 int known_errorHandler = -1;
3586
3587 /* allocate enough for a simple encoding without
3588 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003589 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590 if (res == NULL)
3591 goto onError;
3592 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003593 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003594 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 ressize = size;
3596
3597 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003598 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003600 /* can we encode this? */
3601 if (c<limit) {
3602 /* no overflow check, because we know that the space is enough */
3603 *str++ = (char)c;
3604 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003605 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003606 else {
3607 Py_ssize_t unicodepos = p-startp;
3608 Py_ssize_t requiredsize;
3609 PyObject *repunicode;
3610 Py_ssize_t repsize;
3611 Py_ssize_t newpos;
3612 Py_ssize_t respos;
3613 Py_UNICODE *uni2;
3614 /* startpos for collecting unencodable chars */
3615 const Py_UNICODE *collstart = p;
3616 const Py_UNICODE *collend = p;
3617 /* find all unecodable characters */
3618 while ((collend < endp) && ((*collend)>=limit))
3619 ++collend;
3620 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3621 if (known_errorHandler==-1) {
3622 if ((errors==NULL) || (!strcmp(errors, "strict")))
3623 known_errorHandler = 1;
3624 else if (!strcmp(errors, "replace"))
3625 known_errorHandler = 2;
3626 else if (!strcmp(errors, "ignore"))
3627 known_errorHandler = 3;
3628 else if (!strcmp(errors, "xmlcharrefreplace"))
3629 known_errorHandler = 4;
3630 else
3631 known_errorHandler = 0;
3632 }
3633 switch (known_errorHandler) {
3634 case 1: /* strict */
3635 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3636 goto onError;
3637 case 2: /* replace */
3638 while (collstart++<collend)
3639 *str++ = '?'; /* fall through */
3640 case 3: /* ignore */
3641 p = collend;
3642 break;
3643 case 4: /* xmlcharrefreplace */
3644 respos = str-PyString_AS_STRING(res);
3645 /* determine replacement size (temporarily (mis)uses p) */
3646 for (p = collstart, repsize = 0; p < collend; ++p) {
3647 if (*p<10)
3648 repsize += 2+1+1;
3649 else if (*p<100)
3650 repsize += 2+2+1;
3651 else if (*p<1000)
3652 repsize += 2+3+1;
3653 else if (*p<10000)
3654 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003655#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003656 else
3657 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003658#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003659 else if (*p<100000)
3660 repsize += 2+5+1;
3661 else if (*p<1000000)
3662 repsize += 2+6+1;
3663 else
3664 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003665#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003666 }
3667 requiredsize = respos+repsize+(endp-collend);
3668 if (requiredsize > ressize) {
3669 if (requiredsize<2*ressize)
3670 requiredsize = 2*ressize;
3671 if (_PyString_Resize(&res, requiredsize))
3672 goto onError;
3673 str = PyString_AS_STRING(res) + respos;
3674 ressize = requiredsize;
3675 }
3676 /* generate replacement (temporarily (mis)uses p) */
3677 for (p = collstart; p < collend; ++p) {
3678 str += sprintf(str, "&#%d;", (int)*p);
3679 }
3680 p = collend;
3681 break;
3682 default:
3683 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3684 encoding, reason, startp, size, &exc,
3685 collstart-startp, collend-startp, &newpos);
3686 if (repunicode == NULL)
3687 goto onError;
3688 /* need more space? (at least enough for what we have+the
3689 replacement+the rest of the string, so we won't have to
3690 check space for encodable characters) */
3691 respos = str-PyString_AS_STRING(res);
3692 repsize = PyUnicode_GET_SIZE(repunicode);
3693 requiredsize = respos+repsize+(endp-collend);
3694 if (requiredsize > ressize) {
3695 if (requiredsize<2*ressize)
3696 requiredsize = 2*ressize;
3697 if (_PyString_Resize(&res, requiredsize)) {
3698 Py_DECREF(repunicode);
3699 goto onError;
3700 }
3701 str = PyString_AS_STRING(res) + respos;
3702 ressize = requiredsize;
3703 }
3704 /* check if there is anything unencodable in the replacement
3705 and copy it to the output */
3706 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3707 c = *uni2;
3708 if (c >= limit) {
3709 raise_encode_exception(&exc, encoding, startp, size,
3710 unicodepos, unicodepos+1, reason);
3711 Py_DECREF(repunicode);
3712 goto onError;
3713 }
3714 *str = (char)c;
3715 }
3716 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003717 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003718 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003719 }
3720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003722 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003724 /* If this falls res will be NULL */
3725 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 Py_XDECREF(errorHandler);
3727 Py_XDECREF(exc);
3728 return res;
3729
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003730 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 Py_XDECREF(res);
3732 Py_XDECREF(errorHandler);
3733 Py_XDECREF(exc);
3734 return NULL;
3735}
3736
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003738 Py_ssize_t size,
3739 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742}
3743
3744PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3745{
3746 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003747 PyErr_BadArgument();
3748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749 }
3750 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003751 PyUnicode_GET_SIZE(unicode),
3752 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753}
3754
3755/* --- 7-bit ASCII Codec -------------------------------------------------- */
3756
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003758 Py_ssize_t size,
3759 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 PyUnicodeObject *v;
3763 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003764 Py_ssize_t startinpos;
3765 Py_ssize_t endinpos;
3766 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 const char *e;
3768 PyObject *errorHandler = NULL;
3769 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003770
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003772 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003773 Py_UNICODE r = *(unsigned char*)s;
3774 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003775 }
Tim Petersced69f82003-09-16 20:30:58 +00003776
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 v = _PyUnicode_New(size);
3778 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003779 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003781 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 e = s + size;
3784 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003785 register unsigned char c = (unsigned char)*s;
3786 if (c < 128) {
3787 *p++ = c;
3788 ++s;
3789 }
3790 else {
3791 startinpos = s-starts;
3792 endinpos = startinpos + 1;
3793 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3794 if (unicode_decode_call_errorhandler(
3795 errors, &errorHandler,
3796 "ascii", "ordinal not in range(128)",
3797 starts, size, &startinpos, &endinpos, &exc, &s,
3798 &v, &outpos, &p))
3799 goto onError;
3800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003802 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003803 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3804 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805 Py_XDECREF(errorHandler);
3806 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003808
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003809 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003811 Py_XDECREF(errorHandler);
3812 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 return NULL;
3814}
3815
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003817 Py_ssize_t size,
3818 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821}
3822
3823PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3824{
3825 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003826 PyErr_BadArgument();
3827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 }
3829 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003830 PyUnicode_GET_SIZE(unicode),
3831 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832}
3833
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003834#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003835
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003836/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003837
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003838#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003839#define NEED_RETRY
3840#endif
3841
3842/* XXX This code is limited to "true" double-byte encodings, as
3843 a) it assumes an incomplete character consists of a single byte, and
3844 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003845 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003846
/* Return 1 if the byte at s[offset] is to be treated as a DBCS lead byte,
   0 otherwise.  `s` is the start of the buffer; CharPrev() is used to
   locate the character preceding s[offset]. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* No preceding character (CharPrev did not move). */
    if (prev == curr)
        return 1;
    /* Preceding character does not begin with a lead byte. */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* Preceding character occupies two bytes and ends just before curr. */
    return curr - prev == 2;
}
3857
3858/*
3859 * Decode MBCS string into unicode object. If 'final' is set, converts
3860 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3861 */
3862static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003863 const char *s, /* MBCS string */
3864 int size, /* sizeof MBCS string */
3865 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003866{
3867 Py_UNICODE *p;
3868 Py_ssize_t n = 0;
3869 int usize = 0;
3870
3871 assert(size >= 0);
3872
3873 /* Skip trailing lead-byte unless 'final' is set */
3874 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003875 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003876
3877 /* First get the size of the result */
3878 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003879 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3880 if (usize == 0) {
3881 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3882 return -1;
3883 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003884 }
3885
3886 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003887 /* Create unicode object */
3888 *v = _PyUnicode_New(usize);
3889 if (*v == NULL)
3890 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003891 }
3892 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003893 /* Extend unicode object */
3894 n = PyUnicode_GET_SIZE(*v);
3895 if (_PyUnicode_Resize(v, n + usize) < 0)
3896 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003897 }
3898
3899 /* Do the conversion */
3900 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003901 p = PyUnicode_AS_UNICODE(*v) + n;
3902 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3903 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3904 return -1;
3905 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003906 }
3907
3908 return size;
3909}
3910
3911PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003912 Py_ssize_t size,
3913 const char *errors,
3914 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003915{
3916 PyUnicodeObject *v = NULL;
3917 int done;
3918
3919 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003920 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003921
3922#ifdef NEED_RETRY
3923 retry:
3924 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003925 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003926 else
3927#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003928 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003929
3930 if (done < 0) {
3931 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003932 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003933 }
3934
3935 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003936 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003937
3938#ifdef NEED_RETRY
3939 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003940 s += done;
3941 size -= done;
3942 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003943 }
3944#endif
3945
3946 return (PyObject *)v;
3947}
3948
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003949PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003950 Py_ssize_t size,
3951 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003952{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003953 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3954}
3955
3956/*
3957 * Convert unicode into string object (MBCS).
3958 * Returns 0 if succeed, -1 otherwise.
3959 */
3960static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003961 const Py_UNICODE *p, /* unicode */
3962 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003963{
3964 int mbcssize = 0;
3965 Py_ssize_t n = 0;
3966
3967 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003968
3969 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003970 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003971 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3972 if (mbcssize == 0) {
3973 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3974 return -1;
3975 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003976 }
3977
Martin v. Löwisd8251432006-06-14 05:21:04 +00003978 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003979 /* Create string object */
3980 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3981 if (*repr == NULL)
3982 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003983 }
3984 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003985 /* Extend string object */
3986 n = PyString_Size(*repr);
3987 if (_PyString_Resize(repr, n + mbcssize) < 0)
3988 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003989 }
3990
3991 /* Do the conversion */
3992 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003993 char *s = PyString_AS_STRING(*repr) + n;
3994 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3995 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3996 return -1;
3997 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003998 }
3999
4000 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004001}
4002
4003PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004004 Py_ssize_t size,
4005 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004006{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004007 PyObject *repr = NULL;
4008 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004009
Martin v. Löwisd8251432006-06-14 05:21:04 +00004010#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004011 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004012 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004013 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004014 else
4015#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004016 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004017
Martin v. Löwisd8251432006-06-14 05:21:04 +00004018 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004019 Py_XDECREF(repr);
4020 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004021 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004022
4023#ifdef NEED_RETRY
4024 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004025 p += INT_MAX;
4026 size -= INT_MAX;
4027 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004028 }
4029#endif
4030
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004031 return repr;
4032}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004033
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004034PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4035{
4036 if (!PyUnicode_Check(unicode)) {
4037 PyErr_BadArgument();
4038 return NULL;
4039 }
4040 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004041 PyUnicode_GET_SIZE(unicode),
4042 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004043}
4044
Martin v. Löwisd8251432006-06-14 05:21:04 +00004045#undef NEED_RETRY
4046
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004047#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004048
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049/* --- Character Mapping Codec -------------------------------------------- */
4050
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004052 Py_ssize_t size,
4053 PyObject *mapping,
4054 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004056 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004057 Py_ssize_t startinpos;
4058 Py_ssize_t endinpos;
4059 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004060 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 PyUnicodeObject *v;
4062 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004063 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004064 PyObject *errorHandler = NULL;
4065 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004066 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004067 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004068
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 /* Default to Latin-1 */
4070 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004071 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072
4073 v = _PyUnicode_New(size);
4074 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004075 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004077 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004080 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004081 mapstring = PyUnicode_AS_UNICODE(mapping);
4082 maplen = PyUnicode_GET_SIZE(mapping);
4083 while (s < e) {
4084 unsigned char ch = *s;
4085 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004087 if (ch < maplen)
4088 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004090 if (x == 0xfffe) {
4091 /* undefined mapping */
4092 outpos = p-PyUnicode_AS_UNICODE(v);
4093 startinpos = s-starts;
4094 endinpos = startinpos+1;
4095 if (unicode_decode_call_errorhandler(
4096 errors, &errorHandler,
4097 "charmap", "character maps to <undefined>",
4098 starts, size, &startinpos, &endinpos, &exc, &s,
4099 &v, &outpos, &p)) {
4100 goto onError;
4101 }
4102 continue;
4103 }
4104 *p++ = x;
4105 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004106 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004107 }
4108 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004109 while (s < e) {
4110 unsigned char ch = *s;
4111 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004112
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004113 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4114 w = PyInt_FromLong((long)ch);
4115 if (w == NULL)
4116 goto onError;
4117 x = PyObject_GetItem(mapping, w);
4118 Py_DECREF(w);
4119 if (x == NULL) {
4120 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4121 /* No mapping found means: mapping is undefined. */
4122 PyErr_Clear();
4123 x = Py_None;
4124 Py_INCREF(x);
4125 } else
4126 goto onError;
4127 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004128
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004129 /* Apply mapping */
4130 if (PyInt_Check(x)) {
4131 long value = PyInt_AS_LONG(x);
4132 if (value < 0 || value > 65535) {
4133 PyErr_SetString(PyExc_TypeError,
4134 "character mapping must be in range(65536)");
4135 Py_DECREF(x);
4136 goto onError;
4137 }
4138 *p++ = (Py_UNICODE)value;
4139 }
4140 else if (x == Py_None) {
4141 /* undefined mapping */
4142 outpos = p-PyUnicode_AS_UNICODE(v);
4143 startinpos = s-starts;
4144 endinpos = startinpos+1;
4145 if (unicode_decode_call_errorhandler(
4146 errors, &errorHandler,
4147 "charmap", "character maps to <undefined>",
4148 starts, size, &startinpos, &endinpos, &exc, &s,
4149 &v, &outpos, &p)) {
4150 Py_DECREF(x);
4151 goto onError;
4152 }
4153 Py_DECREF(x);
4154 continue;
4155 }
4156 else if (PyUnicode_Check(x)) {
4157 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004158
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004159 if (targetsize == 1)
4160 /* 1-1 mapping */
4161 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004162
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004163 else if (targetsize > 1) {
4164 /* 1-n mapping */
4165 if (targetsize > extrachars) {
4166 /* resize first */
4167 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4168 Py_ssize_t needed = (targetsize - extrachars) + \
4169 (targetsize << 2);
4170 extrachars += needed;
4171 /* XXX overflow detection missing */
4172 if (_PyUnicode_Resize(&v,
4173 PyUnicode_GET_SIZE(v) + needed) < 0) {
4174 Py_DECREF(x);
4175 goto onError;
4176 }
4177 p = PyUnicode_AS_UNICODE(v) + oldpos;
4178 }
4179 Py_UNICODE_COPY(p,
4180 PyUnicode_AS_UNICODE(x),
4181 targetsize);
4182 p += targetsize;
4183 extrachars -= targetsize;
4184 }
4185 /* 1-0 mapping: skip the character */
4186 }
4187 else {
4188 /* wrong return value */
4189 PyErr_SetString(PyExc_TypeError,
4190 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004191 Py_DECREF(x);
4192 goto onError;
4193 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004194 Py_DECREF(x);
4195 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 }
4198 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004199 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4200 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 Py_XDECREF(errorHandler);
4202 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004204
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004205 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 Py_XDECREF(errorHandler);
4207 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 Py_XDECREF(v);
4209 return NULL;
4210}
4211
Martin v. Löwis3f767792006-06-04 19:36:28 +00004212/* Charmap encoding: the lookup table */
4213
4214struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004215 PyObject_HEAD
4216 unsigned char level1[32];
4217 int count2, count3;
4218 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004219};
4220
4221static PyObject*
4222encoding_map_size(PyObject *obj, PyObject* args)
4223{
4224 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004225 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004226 128*map->count3);
4227}
4228
4229static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004230 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004231 PyDoc_STR("Return the size (in bytes) of this object") },
4232 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004233};
4234
4235static void
4236encoding_map_dealloc(PyObject* o)
4237{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004238 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004239}
4240
4241static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004242 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004243 "EncodingMap", /*tp_name*/
4244 sizeof(struct encoding_map), /*tp_basicsize*/
4245 0, /*tp_itemsize*/
4246 /* methods */
4247 encoding_map_dealloc, /*tp_dealloc*/
4248 0, /*tp_print*/
4249 0, /*tp_getattr*/
4250 0, /*tp_setattr*/
4251 0, /*tp_compare*/
4252 0, /*tp_repr*/
4253 0, /*tp_as_number*/
4254 0, /*tp_as_sequence*/
4255 0, /*tp_as_mapping*/
4256 0, /*tp_hash*/
4257 0, /*tp_call*/
4258 0, /*tp_str*/
4259 0, /*tp_getattro*/
4260 0, /*tp_setattro*/
4261 0, /*tp_as_buffer*/
4262 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4263 0, /*tp_doc*/
4264 0, /*tp_traverse*/
4265 0, /*tp_clear*/
4266 0, /*tp_richcompare*/
4267 0, /*tp_weaklistoffset*/
4268 0, /*tp_iter*/
4269 0, /*tp_iternext*/
4270 encoding_map_methods, /*tp_methods*/
4271 0, /*tp_members*/
4272 0, /*tp_getset*/
4273 0, /*tp_base*/
4274 0, /*tp_dict*/
4275 0, /*tp_descr_get*/
4276 0, /*tp_descr_set*/
4277 0, /*tp_dictoffset*/
4278 0, /*tp_init*/
4279 0, /*tp_alloc*/
4280 0, /*tp_new*/
4281 0, /*tp_free*/
4282 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004283};
4284
4285PyObject*
4286PyUnicode_BuildEncodingMap(PyObject* string)
4287{
4288 Py_UNICODE *decode;
4289 PyObject *result;
4290 struct encoding_map *mresult;
4291 int i;
4292 int need_dict = 0;
4293 unsigned char level1[32];
4294 unsigned char level2[512];
4295 unsigned char *mlevel1, *mlevel2, *mlevel3;
4296 int count2 = 0, count3 = 0;
4297
4298 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4299 PyErr_BadArgument();
4300 return NULL;
4301 }
4302 decode = PyUnicode_AS_UNICODE(string);
4303 memset(level1, 0xFF, sizeof level1);
4304 memset(level2, 0xFF, sizeof level2);
4305
4306 /* If there isn't a one-to-one mapping of NULL to \0,
4307 or if there are non-BMP characters, we need to use
4308 a mapping dictionary. */
4309 if (decode[0] != 0)
4310 need_dict = 1;
4311 for (i = 1; i < 256; i++) {
4312 int l1, l2;
4313 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004314#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004315 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004316#endif
4317 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004318 need_dict = 1;
4319 break;
4320 }
4321 if (decode[i] == 0xFFFE)
4322 /* unmapped character */
4323 continue;
4324 l1 = decode[i] >> 11;
4325 l2 = decode[i] >> 7;
4326 if (level1[l1] == 0xFF)
4327 level1[l1] = count2++;
4328 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004329 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004330 }
4331
4332 if (count2 >= 0xFF || count3 >= 0xFF)
4333 need_dict = 1;
4334
4335 if (need_dict) {
4336 PyObject *result = PyDict_New();
4337 PyObject *key, *value;
4338 if (!result)
4339 return NULL;
4340 for (i = 0; i < 256; i++) {
4341 key = value = NULL;
4342 key = PyInt_FromLong(decode[i]);
4343 value = PyInt_FromLong(i);
4344 if (!key || !value)
4345 goto failed1;
4346 if (PyDict_SetItem(result, key, value) == -1)
4347 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004348 Py_DECREF(key);
4349 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004350 }
4351 return result;
4352 failed1:
4353 Py_XDECREF(key);
4354 Py_XDECREF(value);
4355 Py_DECREF(result);
4356 return NULL;
4357 }
4358
4359 /* Create a three-level trie */
4360 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4361 16*count2 + 128*count3 - 1);
4362 if (!result)
4363 return PyErr_NoMemory();
4364 PyObject_Init(result, &EncodingMapType);
4365 mresult = (struct encoding_map*)result;
4366 mresult->count2 = count2;
4367 mresult->count3 = count3;
4368 mlevel1 = mresult->level1;
4369 mlevel2 = mresult->level23;
4370 mlevel3 = mresult->level23 + 16*count2;
4371 memcpy(mlevel1, level1, 32);
4372 memset(mlevel2, 0xFF, 16*count2);
4373 memset(mlevel3, 0, 128*count3);
4374 count3 = 0;
4375 for (i = 1; i < 256; i++) {
4376 int o1, o2, o3, i2, i3;
4377 if (decode[i] == 0xFFFE)
4378 /* unmapped character */
4379 continue;
4380 o1 = decode[i]>>11;
4381 o2 = (decode[i]>>7) & 0xF;
4382 i2 = 16*mlevel1[o1] + o2;
4383 if (mlevel2[i2] == 0xFF)
4384 mlevel2[i2] = count3++;
4385 o3 = decode[i] & 0x7F;
4386 i3 = 128*mlevel2[i2] + o3;
4387 mlevel3[i3] = i;
4388 }
4389 return result;
4390}
4391
4392static int
4393encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4394{
4395 struct encoding_map *map = (struct encoding_map*)mapping;
4396 int l1 = c>>11;
4397 int l2 = (c>>7) & 0xF;
4398 int l3 = c & 0x7F;
4399 int i;
4400
4401#ifdef Py_UNICODE_WIDE
4402 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004403 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004404 }
4405#endif
4406 if (c == 0)
4407 return 0;
4408 /* level 1*/
4409 i = map->level1[l1];
4410 if (i == 0xFF) {
4411 return -1;
4412 }
4413 /* level 2*/
4414 i = map->level23[16*i+l2];
4415 if (i == 0xFF) {
4416 return -1;
4417 }
4418 /* level 3 */
4419 i = map->level23[16*map->count2 + 128*i + l3];
4420 if (i == 0) {
4421 return -1;
4422 }
4423 return i;
4424}
4425
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426/* Lookup the character ch in the mapping. If the character
4427 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004428 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004429static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 PyObject *w = PyInt_FromLong((long)c);
4432 PyObject *x;
4433
4434 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004435 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 x = PyObject_GetItem(mapping, w);
4437 Py_DECREF(w);
4438 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004439 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4440 /* No mapping found means: mapping is undefined. */
4441 PyErr_Clear();
4442 x = Py_None;
4443 Py_INCREF(x);
4444 return x;
4445 } else
4446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004448 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004449 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004451 long value = PyInt_AS_LONG(x);
4452 if (value < 0 || value > 255) {
4453 PyErr_SetString(PyExc_TypeError,
4454 "character mapping must be in range(256)");
4455 Py_DECREF(x);
4456 return NULL;
4457 }
4458 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004460 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004461 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004463 /* wrong return value */
4464 PyErr_SetString(PyExc_TypeError,
4465 "character mapping must return integer, None or str");
4466 Py_DECREF(x);
4467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 }
4469}
4470
Martin v. Löwis3f767792006-06-04 19:36:28 +00004471static int
4472charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4473{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004474 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4475 /* exponentially overallocate to minimize reallocations */
4476 if (requiredsize < 2*outsize)
4477 requiredsize = 2*outsize;
4478 if (_PyString_Resize(outobj, requiredsize)) {
4479 return 0;
4480 }
4481 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004482}
4483
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or a reallocation failed */
} charmapencode_result;
/* lookup the character, put the result in the output string and adjust
   various state variables. Reallocate the output string if not enough
   space is available. Return enc_SUCCESS if the character was written,
   enc_FAILED if the mapping was undefined (in which case no character
   was written), or enc_EXCEPTION if the lookup or a reallocation
   failed. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004493static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004494charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004495 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004497 PyObject *rep;
4498 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004499 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500
Christian Heimese93237d2007-12-19 02:37:44 +00004501 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004502 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004503 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004504 if (res == -1)
4505 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004506 if (outsize<requiredsize)
4507 if (!charmapencode_resize(outobj, outpos, requiredsize))
4508 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004509 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004510 outstart[(*outpos)++] = (char)res;
4511 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004512 }
4513
4514 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004516 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004517 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004518 Py_DECREF(rep);
4519 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004520 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004521 if (PyInt_Check(rep)) {
4522 Py_ssize_t requiredsize = *outpos+1;
4523 if (outsize<requiredsize)
4524 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4525 Py_DECREF(rep);
4526 return enc_EXCEPTION;
4527 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004528 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004529 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004530 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004531 else {
4532 const char *repchars = PyString_AS_STRING(rep);
4533 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4534 Py_ssize_t requiredsize = *outpos+repsize;
4535 if (outsize<requiredsize)
4536 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4537 Py_DECREF(rep);
4538 return enc_EXCEPTION;
4539 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004540 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004541 memcpy(outstart + *outpos, repchars, repsize);
4542 *outpos += repsize;
4543 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 }
Georg Brandl9f167602006-06-04 21:46:16 +00004545 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004546 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547}
4548
4549/* handle an error in PyUnicode_EncodeCharmap
4550 Return 0 on success, -1 on error */
4551static
4552int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004553 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004555 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004556 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004557{
4558 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004559 Py_ssize_t repsize;
4560 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561 Py_UNICODE *uni2;
4562 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004563 Py_ssize_t collstartpos = *inpos;
4564 Py_ssize_t collendpos = *inpos+1;
4565 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 char *encoding = "charmap";
4567 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004568 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570 /* find all unencodable characters */
4571 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004572 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004573 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004574 int res = encoding_map_lookup(p[collendpos], mapping);
4575 if (res != -1)
4576 break;
4577 ++collendpos;
4578 continue;
4579 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004580
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004581 rep = charmapencode_lookup(p[collendpos], mapping);
4582 if (rep==NULL)
4583 return -1;
4584 else if (rep!=Py_None) {
4585 Py_DECREF(rep);
4586 break;
4587 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004588 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004589 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590 }
4591 /* cache callback name lookup
4592 * (if not done yet, i.e. it's the first error) */
4593 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004594 if ((errors==NULL) || (!strcmp(errors, "strict")))
4595 *known_errorHandler = 1;
4596 else if (!strcmp(errors, "replace"))
4597 *known_errorHandler = 2;
4598 else if (!strcmp(errors, "ignore"))
4599 *known_errorHandler = 3;
4600 else if (!strcmp(errors, "xmlcharrefreplace"))
4601 *known_errorHandler = 4;
4602 else
4603 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 }
4605 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004606 case 1: /* strict */
4607 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4608 return -1;
4609 case 2: /* replace */
4610 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004611 x = charmapencode_output('?', mapping, res, respos);
4612 if (x==enc_EXCEPTION) {
4613 return -1;
4614 }
4615 else if (x==enc_FAILED) {
4616 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4617 return -1;
4618 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004619 }
4620 /* fall through */
4621 case 3: /* ignore */
4622 *inpos = collendpos;
4623 break;
4624 case 4: /* xmlcharrefreplace */
4625 /* generate replacement (temporarily (mis)uses p) */
4626 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004627 char buffer[2+29+1+1];
4628 char *cp;
4629 sprintf(buffer, "&#%d;", (int)p[collpos]);
4630 for (cp = buffer; *cp; ++cp) {
4631 x = charmapencode_output(*cp, mapping, res, respos);
4632 if (x==enc_EXCEPTION)
4633 return -1;
4634 else if (x==enc_FAILED) {
4635 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4636 return -1;
4637 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004638 }
4639 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004640 *inpos = collendpos;
4641 break;
4642 default:
4643 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004644 encoding, reason, p, size, exceptionObject,
4645 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004646 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004647 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004648 /* generate replacement */
4649 repsize = PyUnicode_GET_SIZE(repunicode);
4650 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004651 x = charmapencode_output(*uni2, mapping, res, respos);
4652 if (x==enc_EXCEPTION) {
4653 return -1;
4654 }
4655 else if (x==enc_FAILED) {
4656 Py_DECREF(repunicode);
4657 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4658 return -1;
4659 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004660 }
4661 *inpos = newpos;
4662 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663 }
4664 return 0;
4665}
4666
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004668 Py_ssize_t size,
4669 PyObject *mapping,
4670 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004672 /* output object */
4673 PyObject *res = NULL;
4674 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004675 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004677 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 PyObject *errorHandler = NULL;
4679 PyObject *exc = NULL;
4680 /* the following variable is used for caching string comparisons
4681 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4682 * 3=ignore, 4=xmlcharrefreplace */
4683 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684
4685 /* Default to Latin-1 */
4686 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004687 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 /* allocate enough for a simple encoding without
4690 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004691 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 if (res == NULL)
4693 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004694 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004695 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004697 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004698 /* try to encode it */
4699 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4700 if (x==enc_EXCEPTION) /* error */
4701 goto onError;
4702 if (x==enc_FAILED) { /* unencodable character */
4703 if (charmap_encoding_error(p, size, &inpos, mapping,
4704 &exc,
4705 &known_errorHandler, &errorHandler, errors,
4706 &res, &respos)) {
4707 goto onError;
4708 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004709 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004710 else
4711 /* done with this character => adjust input position */
4712 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004715 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004716 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004717 if (_PyString_Resize(&res, respos))
4718 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 }
4720 Py_XDECREF(exc);
4721 Py_XDECREF(errorHandler);
4722 return res;
4723
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004724 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004725 Py_XDECREF(res);
4726 Py_XDECREF(exc);
4727 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 return NULL;
4729}
4730
4731PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004732 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733{
4734 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004735 PyErr_BadArgument();
4736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 }
4738 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004739 PyUnicode_GET_SIZE(unicode),
4740 mapping,
4741 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742}
4743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744/* create or adjust a UnicodeTranslateError */
4745static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004746 const Py_UNICODE *unicode, Py_ssize_t size,
4747 Py_ssize_t startpos, Py_ssize_t endpos,
4748 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004750 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004751 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004752 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753 }
4754 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004755 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4756 goto onError;
4757 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4758 goto onError;
4759 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4760 goto onError;
4761 return;
4762 onError:
4763 Py_DECREF(*exceptionObject);
4764 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 }
4766}
4767
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768/* raises a UnicodeTranslateError */
4769static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004770 const Py_UNICODE *unicode, Py_ssize_t size,
4771 Py_ssize_t startpos, Py_ssize_t endpos,
4772 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773{
4774 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004775 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004776 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004777 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004778}
4779
4780/* error handling callback helper:
4781 build arguments, call the callback and check the arguments,
4782 put the result into newpos and return the replacement string, which
4783 has to be freed by the caller */
4784static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004785 PyObject **errorHandler,
4786 const char *reason,
4787 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4788 Py_ssize_t startpos, Py_ssize_t endpos,
4789 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004791 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792
Martin v. Löwis412fb672006-04-13 06:34:32 +00004793 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794 PyObject *restuple;
4795 PyObject *resunicode;
4796
4797 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004798 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004799 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004800 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 }
4802
4803 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004804 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004806 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807
4808 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004809 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004811 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004813 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004814 Py_DECREF(restuple);
4815 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 }
4817 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004818 &resunicode, &i_newpos)) {
4819 Py_DECREF(restuple);
4820 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004822 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004823 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004824 else
4825 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004826 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004827 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4828 Py_DECREF(restuple);
4829 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004830 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 Py_INCREF(resunicode);
4832 Py_DECREF(restuple);
4833 return resunicode;
4834}
4835
4836/* Lookup the character ch in the mapping and put the result in result,
4837 which must be decrefed by the caller.
4838 Return 0 on success, -1 on error */
4839static
4840int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4841{
4842 PyObject *w = PyInt_FromLong((long)c);
4843 PyObject *x;
4844
4845 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004846 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004847 x = PyObject_GetItem(mapping, w);
4848 Py_DECREF(w);
4849 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004850 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4851 /* No mapping found means: use 1:1 mapping. */
4852 PyErr_Clear();
4853 *result = NULL;
4854 return 0;
4855 } else
4856 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004857 }
4858 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004859 *result = x;
4860 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 }
4862 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004863 long value = PyInt_AS_LONG(x);
4864 long max = PyUnicode_GetMax();
4865 if (value < 0 || value > max) {
4866 PyErr_Format(PyExc_TypeError,
4867 "character mapping must be in range(0x%lx)", max+1);
4868 Py_DECREF(x);
4869 return -1;
4870 }
4871 *result = x;
4872 return 0;
4873 }
4874 else if (PyUnicode_Check(x)) {
4875 *result = x;
4876 return 0;
4877 }
4878 else {
4879 /* wrong return value */
4880 PyErr_SetString(PyExc_TypeError,
4881 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004882 Py_DECREF(x);
4883 return -1;
4884 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004885}
4886/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004887 if not reallocate and adjust various state variables.
4888 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889static
Walter Dörwald4894c302003-10-24 14:25:28 +00004890int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004891 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004893 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004894 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004895 /* remember old output position */
4896 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4897 /* exponentially overallocate to minimize reallocations */
4898 if (requiredsize < 2 * oldsize)
4899 requiredsize = 2 * oldsize;
4900 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4901 return -1;
4902 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903 }
4904 return 0;
4905}
4906/* lookup the character, put the result in the output string and adjust
4907 various state variables. Return a new reference to the object that
4908 was put in the output buffer in *result, or Py_None, if the mapping was
4909 undefined (in which case no character was written).
4910 The called must decref result.
4911 Return 0 on success, -1 on error. */
4912static
Walter Dörwald4894c302003-10-24 14:25:28 +00004913int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004914 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4915 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004916{
Walter Dörwald4894c302003-10-24 14:25:28 +00004917 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004918 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004920 /* not found => default to 1:1 mapping */
4921 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922 }
4923 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004924 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004926 /* no overflow check, because we know that the space is enough */
4927 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004928 }
4929 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004930 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4931 if (repsize==1) {
4932 /* no overflow check, because we know that the space is enough */
4933 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4934 }
4935 else if (repsize!=0) {
4936 /* more than one character */
4937 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4938 (insize - (curinp-startinp)) +
4939 repsize - 1;
4940 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4941 return -1;
4942 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4943 *outp += repsize;
4944 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004945 }
4946 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004947 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 return 0;
4949}
4950
4951PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004952 Py_ssize_t size,
4953 PyObject *mapping,
4954 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004956 /* output object */
4957 PyObject *res = NULL;
4958 /* pointers to the beginning and end+1 of input */
4959 const Py_UNICODE *startp = p;
4960 const Py_UNICODE *endp = p + size;
4961 /* pointer into the output */
4962 Py_UNICODE *str;
4963 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004964 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004965 char *reason = "character maps to <undefined>";
4966 PyObject *errorHandler = NULL;
4967 PyObject *exc = NULL;
4968 /* the following variable is used for caching string comparisons
4969 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4970 * 3=ignore, 4=xmlcharrefreplace */
4971 int known_errorHandler = -1;
4972
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004974 PyErr_BadArgument();
4975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004976 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977
4978 /* allocate enough for a simple 1:1 translation without
4979 replacements, if we need more, we'll resize */
4980 res = PyUnicode_FromUnicode(NULL, size);
4981 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004982 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004984 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004985 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004988 /* try to encode it */
4989 PyObject *x = NULL;
4990 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4991 Py_XDECREF(x);
4992 goto onError;
4993 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004994 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004995 if (x!=Py_None) /* it worked => adjust input pointer */
4996 ++p;
4997 else { /* untranslatable character */
4998 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4999 Py_ssize_t repsize;
5000 Py_ssize_t newpos;
5001 Py_UNICODE *uni2;
5002 /* startpos for collecting untranslatable chars */
5003 const Py_UNICODE *collstart = p;
5004 const Py_UNICODE *collend = p+1;
5005 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005007 /* find all untranslatable characters */
5008 while (collend < endp) {
5009 if (charmaptranslate_lookup(*collend, mapping, &x))
5010 goto onError;
5011 Py_XDECREF(x);
5012 if (x!=Py_None)
5013 break;
5014 ++collend;
5015 }
5016 /* cache callback name lookup
5017 * (if not done yet, i.e. it's the first error) */
5018 if (known_errorHandler==-1) {
5019 if ((errors==NULL) || (!strcmp(errors, "strict")))
5020 known_errorHandler = 1;
5021 else if (!strcmp(errors, "replace"))
5022 known_errorHandler = 2;
5023 else if (!strcmp(errors, "ignore"))
5024 known_errorHandler = 3;
5025 else if (!strcmp(errors, "xmlcharrefreplace"))
5026 known_errorHandler = 4;
5027 else
5028 known_errorHandler = 0;
5029 }
5030 switch (known_errorHandler) {
5031 case 1: /* strict */
5032 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005033 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005034 case 2: /* replace */
5035 /* No need to check for space, this is a 1:1 replacement */
5036 for (coll = collstart; coll<collend; ++coll)
5037 *str++ = '?';
5038 /* fall through */
5039 case 3: /* ignore */
5040 p = collend;
5041 break;
5042 case 4: /* xmlcharrefreplace */
5043 /* generate replacement (temporarily (mis)uses p) */
5044 for (p = collstart; p < collend; ++p) {
5045 char buffer[2+29+1+1];
5046 char *cp;
5047 sprintf(buffer, "&#%d;", (int)*p);
5048 if (charmaptranslate_makespace(&res, &str,
5049 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5050 goto onError;
5051 for (cp = buffer; *cp; ++cp)
5052 *str++ = *cp;
5053 }
5054 p = collend;
5055 break;
5056 default:
5057 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5058 reason, startp, size, &exc,
5059 collstart-startp, collend-startp, &newpos);
5060 if (repunicode == NULL)
5061 goto onError;
5062 /* generate replacement */
5063 repsize = PyUnicode_GET_SIZE(repunicode);
5064 if (charmaptranslate_makespace(&res, &str,
5065 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5066 Py_DECREF(repunicode);
5067 goto onError;
5068 }
5069 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5070 *str++ = *uni2;
5071 p = startp + newpos;
5072 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005073 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005074 }
5075 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005076 /* Resize if we allocated to much */
5077 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005078 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005079 if (PyUnicode_Resize(&res, respos) < 0)
5080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005081 }
5082 Py_XDECREF(exc);
5083 Py_XDECREF(errorHandler);
5084 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005085
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005086 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005087 Py_XDECREF(res);
5088 Py_XDECREF(exc);
5089 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090 return NULL;
5091}
5092
5093PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005094 PyObject *mapping,
5095 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096{
5097 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005098
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 str = PyUnicode_FromObject(str);
5100 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005101 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005103 PyUnicode_GET_SIZE(str),
5104 mapping,
5105 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 Py_DECREF(str);
5107 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005108
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005109 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 Py_XDECREF(str);
5111 return NULL;
5112}
Tim Petersced69f82003-09-16 20:30:58 +00005113
Guido van Rossum9e896b32000-04-05 20:11:21 +00005114/* --- Decimal Encoder ---------------------------------------------------- */
5115
5116int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005117 Py_ssize_t length,
5118 char *output,
5119 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005120{
5121 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005122 PyObject *errorHandler = NULL;
5123 PyObject *exc = NULL;
5124 const char *encoding = "decimal";
5125 const char *reason = "invalid decimal Unicode string";
5126 /* the following variable is used for caching string comparisons
5127 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5128 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005129
5130 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005131 PyErr_BadArgument();
5132 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005133 }
5134
5135 p = s;
5136 end = s + length;
5137 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005138 register Py_UNICODE ch = *p;
5139 int decimal;
5140 PyObject *repunicode;
5141 Py_ssize_t repsize;
5142 Py_ssize_t newpos;
5143 Py_UNICODE *uni2;
5144 Py_UNICODE *collstart;
5145 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005146
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005147 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005148 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005149 ++p;
5150 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005151 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005152 decimal = Py_UNICODE_TODECIMAL(ch);
5153 if (decimal >= 0) {
5154 *output++ = '0' + decimal;
5155 ++p;
5156 continue;
5157 }
5158 if (0 < ch && ch < 256) {
5159 *output++ = (char)ch;
5160 ++p;
5161 continue;
5162 }
5163 /* All other characters are considered unencodable */
5164 collstart = p;
5165 collend = p+1;
5166 while (collend < end) {
5167 if ((0 < *collend && *collend < 256) ||
5168 !Py_UNICODE_ISSPACE(*collend) ||
5169 Py_UNICODE_TODECIMAL(*collend))
5170 break;
5171 }
5172 /* cache callback name lookup
5173 * (if not done yet, i.e. it's the first error) */
5174 if (known_errorHandler==-1) {
5175 if ((errors==NULL) || (!strcmp(errors, "strict")))
5176 known_errorHandler = 1;
5177 else if (!strcmp(errors, "replace"))
5178 known_errorHandler = 2;
5179 else if (!strcmp(errors, "ignore"))
5180 known_errorHandler = 3;
5181 else if (!strcmp(errors, "xmlcharrefreplace"))
5182 known_errorHandler = 4;
5183 else
5184 known_errorHandler = 0;
5185 }
5186 switch (known_errorHandler) {
5187 case 1: /* strict */
5188 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5189 goto onError;
5190 case 2: /* replace */
5191 for (p = collstart; p < collend; ++p)
5192 *output++ = '?';
5193 /* fall through */
5194 case 3: /* ignore */
5195 p = collend;
5196 break;
5197 case 4: /* xmlcharrefreplace */
5198 /* generate replacement (temporarily (mis)uses p) */
5199 for (p = collstart; p < collend; ++p)
5200 output += sprintf(output, "&#%d;", (int)*p);
5201 p = collend;
5202 break;
5203 default:
5204 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5205 encoding, reason, s, length, &exc,
5206 collstart-s, collend-s, &newpos);
5207 if (repunicode == NULL)
5208 goto onError;
5209 /* generate replacement */
5210 repsize = PyUnicode_GET_SIZE(repunicode);
5211 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5212 Py_UNICODE ch = *uni2;
5213 if (Py_UNICODE_ISSPACE(ch))
5214 *output++ = ' ';
5215 else {
5216 decimal = Py_UNICODE_TODECIMAL(ch);
5217 if (decimal >= 0)
5218 *output++ = '0' + decimal;
5219 else if (0 < ch && ch < 256)
5220 *output++ = (char)ch;
5221 else {
5222 Py_DECREF(repunicode);
5223 raise_encode_exception(&exc, encoding,
5224 s, length, collstart-s, collend-s, reason);
5225 goto onError;
5226 }
5227 }
5228 }
5229 p = s + newpos;
5230 Py_DECREF(repunicode);
5231 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005232 }
5233 /* 0-terminate the output string */
5234 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 Py_XDECREF(exc);
5236 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005237 return 0;
5238
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005239 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005240 Py_XDECREF(exc);
5241 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005242 return -1;
5243}
5244
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245/* --- Helpers ------------------------------------------------------------ */
5246
Eric Smitha9f7d622008-02-17 19:46:49 +00005247#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005248
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005249#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005250
Fredrik Lundha50d2012006-05-26 17:04:58 +00005251#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005252
5253#include "stringlib/count.h"
5254#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005255#include "stringlib/partition.h"
5256
/* Helper macro: normalize the slice bounds held in the enclosing scope's
   `start` and `end` variables against (obj)->length.

   A negative bound is taken relative to the end of the object and then
   clamped at 0; an `end` beyond the length is clamped to the length.
   `start` is not clamped to the length, so callers must cope with
   start > end afterwards.

   The body is wrapped in do { ... } while (0) so the expansion is exactly
   one statement and stays intact under an unbraced if/else.  Invoke it
   with a trailing semicolon.  (obj) is evaluated more than once. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5269
Martin v. Löwis18e16552006-02-15 17:27:45 +00005270Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005271 PyObject *substr,
5272 Py_ssize_t start,
5273 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005275 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005276 PyUnicodeObject* str_obj;
5277 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005278
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005279 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5280 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005281 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005282 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5283 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005284 Py_DECREF(str_obj);
5285 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 }
Tim Petersced69f82003-09-16 20:30:58 +00005287
Fredrik Lundhc8162812006-05-26 19:33:03 +00005288 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005289
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005290 result = stringlib_count(
5291 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5292 );
5293
5294 Py_DECREF(sub_obj);
5295 Py_DECREF(str_obj);
5296
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 return result;
5298}
5299
Martin v. Löwis18e16552006-02-15 17:27:45 +00005300Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005301 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005302 Py_ssize_t start,
5303 Py_ssize_t end,
5304 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005306 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005307
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005308 str = PyUnicode_FromObject(str);
5309 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005310 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005311 sub = PyUnicode_FromObject(sub);
5312 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005313 Py_DECREF(str);
5314 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 }
Tim Petersced69f82003-09-16 20:30:58 +00005316
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005317 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005318 result = stringlib_find_slice(
5319 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5320 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5321 start, end
5322 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005323 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005324 result = stringlib_rfind_slice(
5325 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5326 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5327 start, end
5328 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005329
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005330 Py_DECREF(str);
5331 Py_DECREF(sub);
5332
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 return result;
5334}
5335
Tim Petersced69f82003-09-16 20:30:58 +00005336static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005338 PyUnicodeObject *substring,
5339 Py_ssize_t start,
5340 Py_ssize_t end,
5341 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 if (substring->length == 0)
5344 return 1;
5345
Fredrik Lundhc8162812006-05-26 19:33:03 +00005346 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347
5348 end -= substring->length;
5349 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005350 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351
5352 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005353 if (Py_UNICODE_MATCH(self, end, substring))
5354 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 } else {
5356 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005357 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 }
5359
5360 return 0;
5361}
5362
Martin v. Löwis18e16552006-02-15 17:27:45 +00005363Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005364 PyObject *substr,
5365 Py_ssize_t start,
5366 Py_ssize_t end,
5367 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005369 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005370
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 str = PyUnicode_FromObject(str);
5372 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005373 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 substr = PyUnicode_FromObject(substr);
5375 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005376 Py_DECREF(str);
5377 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 }
Tim Petersced69f82003-09-16 20:30:58 +00005379
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005381 (PyUnicodeObject *)substr,
5382 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 Py_DECREF(str);
5384 Py_DECREF(substr);
5385 return result;
5386}
5387
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388/* Apply fixfct filter to the Unicode object self and return a
5389 reference to the modified object */
5390
Tim Petersced69f82003-09-16 20:30:58 +00005391static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005393 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394{
5395
5396 PyUnicodeObject *u;
5397
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005398 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005400 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005401
5402 Py_UNICODE_COPY(u->str, self->str, self->length);
5403
Tim Peters7a29bd52001-09-12 03:03:31 +00005404 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005405 /* fixfct should return TRUE if it modified the buffer. If
5406 FALSE, return a reference to the original buffer instead
5407 (to save space, not time) */
5408 Py_INCREF(self);
5409 Py_DECREF(u);
5410 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 }
5412 return (PyObject*) u;
5413}
5414
Tim Petersced69f82003-09-16 20:30:58 +00005415static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416int fixupper(PyUnicodeObject *self)
5417{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005418 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 Py_UNICODE *s = self->str;
5420 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005421
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005423 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005424
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005425 ch = Py_UNICODE_TOUPPER(*s);
5426 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005428 *s = ch;
5429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 s++;
5431 }
5432
5433 return status;
5434}
5435
Tim Petersced69f82003-09-16 20:30:58 +00005436static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437int fixlower(PyUnicodeObject *self)
5438{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005439 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 Py_UNICODE *s = self->str;
5441 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005444 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005445
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005446 ch = Py_UNICODE_TOLOWER(*s);
5447 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005449 *s = ch;
5450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451 s++;
5452 }
5453
5454 return status;
5455}
5456
Tim Petersced69f82003-09-16 20:30:58 +00005457static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458int fixswapcase(PyUnicodeObject *self)
5459{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005460 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 Py_UNICODE *s = self->str;
5462 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005463
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 while (len-- > 0) {
5465 if (Py_UNICODE_ISUPPER(*s)) {
5466 *s = Py_UNICODE_TOLOWER(*s);
5467 status = 1;
5468 } else if (Py_UNICODE_ISLOWER(*s)) {
5469 *s = Py_UNICODE_TOUPPER(*s);
5470 status = 1;
5471 }
5472 s++;
5473 }
5474
5475 return status;
5476}
5477
Tim Petersced69f82003-09-16 20:30:58 +00005478static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479int fixcapitalize(PyUnicodeObject *self)
5480{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005481 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005482 Py_UNICODE *s = self->str;
5483 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005484
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005485 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005486 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005487 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005488 *s = Py_UNICODE_TOUPPER(*s);
5489 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005491 s++;
5492 while (--len > 0) {
5493 if (Py_UNICODE_ISUPPER(*s)) {
5494 *s = Py_UNICODE_TOLOWER(*s);
5495 status = 1;
5496 }
5497 s++;
5498 }
5499 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500}
5501
5502static
5503int fixtitle(PyUnicodeObject *self)
5504{
5505 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5506 register Py_UNICODE *e;
5507 int previous_is_cased;
5508
5509 /* Shortcut for single character strings */
5510 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005511 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5512 if (*p != ch) {
5513 *p = ch;
5514 return 1;
5515 }
5516 else
5517 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 }
Tim Petersced69f82003-09-16 20:30:58 +00005519
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 e = p + PyUnicode_GET_SIZE(self);
5521 previous_is_cased = 0;
5522 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005523 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005524
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005525 if (previous_is_cased)
5526 *p = Py_UNICODE_TOLOWER(ch);
5527 else
5528 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005529
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005530 if (Py_UNICODE_ISLOWER(ch) ||
5531 Py_UNICODE_ISUPPER(ch) ||
5532 Py_UNICODE_ISTITLE(ch))
5533 previous_is_cased = 1;
5534 else
5535 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 }
5537 return 1;
5538}
5539
Tim Peters8ce9f162004-08-27 01:49:32 +00005540PyObject *
5541PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542{
Tim Peters8ce9f162004-08-27 01:49:32 +00005543 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005544 const Py_UNICODE blank = ' ';
5545 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005546 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005547 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005548 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5549 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005550 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5551 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005552 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005553 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005554 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555
Tim Peters05eba1f2004-08-27 21:32:02 +00005556 fseq = PySequence_Fast(seq, "");
5557 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005558 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005559 }
5560
Tim Peters91879ab2004-08-27 22:35:44 +00005561 /* Grrrr. A codec may be invoked to convert str objects to
5562 * Unicode, and so it's possible to call back into Python code
5563 * during PyUnicode_FromObject(), and so it's possible for a sick
5564 * codec to change the size of fseq (if seq is a list). Therefore
5565 * we have to keep refetching the size -- can't assume seqlen
5566 * is invariant.
5567 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005568 seqlen = PySequence_Fast_GET_SIZE(fseq);
5569 /* If empty sequence, return u"". */
5570 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005571 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5572 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005573 }
5574 /* If singleton sequence with an exact Unicode, return that. */
5575 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005576 item = PySequence_Fast_GET_ITEM(fseq, 0);
5577 if (PyUnicode_CheckExact(item)) {
5578 Py_INCREF(item);
5579 res = (PyUnicodeObject *)item;
5580 goto Done;
5581 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005582 }
5583
Tim Peters05eba1f2004-08-27 21:32:02 +00005584 /* At least two items to join, or one that isn't exact Unicode. */
5585 if (seqlen > 1) {
5586 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005587 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005588 sep = &blank;
5589 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005590 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005591 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005592 internal_separator = PyUnicode_FromObject(separator);
5593 if (internal_separator == NULL)
5594 goto onError;
5595 sep = PyUnicode_AS_UNICODE(internal_separator);
5596 seplen = PyUnicode_GET_SIZE(internal_separator);
5597 /* In case PyUnicode_FromObject() mutated seq. */
5598 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005599 }
5600 }
5601
5602 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005603 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005604 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005605 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005606 res_p = PyUnicode_AS_UNICODE(res);
5607 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005608
Tim Peters05eba1f2004-08-27 21:32:02 +00005609 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005610 Py_ssize_t itemlen;
5611 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005612
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005613 item = PySequence_Fast_GET_ITEM(fseq, i);
5614 /* Convert item to Unicode. */
5615 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5616 PyErr_Format(PyExc_TypeError,
5617 "sequence item %zd: expected string or Unicode,"
5618 " %.80s found",
5619 i, Py_TYPE(item)->tp_name);
5620 goto onError;
5621 }
5622 item = PyUnicode_FromObject(item);
5623 if (item == NULL)
5624 goto onError;
5625 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005626
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005627 /* In case PyUnicode_FromObject() mutated seq. */
5628 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005629
Tim Peters8ce9f162004-08-27 01:49:32 +00005630 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005631 itemlen = PyUnicode_GET_SIZE(item);
5632 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005633 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005634 goto Overflow;
5635 if (i < seqlen - 1) {
5636 new_res_used += seplen;
5637 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005638 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005639 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005640 if (new_res_used > res_alloc) {
5641 /* double allocated size until it's big enough */
5642 do {
5643 res_alloc += res_alloc;
5644 if (res_alloc <= 0)
5645 goto Overflow;
5646 } while (new_res_used > res_alloc);
5647 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5648 Py_DECREF(item);
5649 goto onError;
5650 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005651 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005652 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005653
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005654 /* Copy item, and maybe the separator. */
5655 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5656 res_p += itemlen;
5657 if (i < seqlen - 1) {
5658 Py_UNICODE_COPY(res_p, sep, seplen);
5659 res_p += seplen;
5660 }
5661 Py_DECREF(item);
5662 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005663 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005664
Tim Peters05eba1f2004-08-27 21:32:02 +00005665 /* Shrink res to match the used area; this probably can't fail,
5666 * but it's cheap to check.
5667 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005668 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005669 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005670
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005671 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005672 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005673 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674 return (PyObject *)res;
5675
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005676 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005677 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005678 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005679 Py_DECREF(item);
5680 /* fall through */
5681
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005682 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005683 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005684 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005685 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 return NULL;
5687}
5688
Tim Petersced69f82003-09-16 20:30:58 +00005689static
5690PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005691 Py_ssize_t left,
5692 Py_ssize_t right,
5693 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694{
5695 PyUnicodeObject *u;
5696
5697 if (left < 0)
5698 left = 0;
5699 if (right < 0)
5700 right = 0;
5701
Tim Peters7a29bd52001-09-12 03:03:31 +00005702 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 Py_INCREF(self);
5704 return self;
5705 }
5706
Neal Norwitze7d8be82008-07-31 17:17:14 +00005707 if (left > PY_SSIZE_T_MAX - self->length ||
5708 right > PY_SSIZE_T_MAX - (left + self->length)) {
5709 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5710 return NULL;
5711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 u = _PyUnicode_New(left + self->length + right);
5713 if (u) {
5714 if (left)
5715 Py_UNICODE_FILL(u->str, fill, left);
5716 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5717 if (right)
5718 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5719 }
5720
5721 return u;
5722}
5723
/* Append the slice data[left:right] to `list` as a new unicode object.

   Relies on three names from the enclosing function: a PyObject *`str`
   scratch variable, the target `list`, and an `onError` label that is
   jumped to when either the object creation or the append fails.  The
   temporary reference held in `str` is always released.

   The body is wrapped in do { ... } while (0) so the expansion is exactly
   one statement (the bare form ended in a dangling `else`).  Invoke it
   with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734
5735static
5736PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005737 PyObject *list,
5738 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005740 register Py_ssize_t i;
5741 register Py_ssize_t j;
5742 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005744 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
5746 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005747 /* find a token */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005748 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005749 i++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005750 j = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005751 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5752 i++;
5753 if (j < i) {
5754 if (maxcount-- <= 0)
5755 break;
5756 SPLIT_APPEND(buf, j, i);
5757 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5758 i++;
5759 j = i;
5760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 }
5762 if (j < len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005763 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 }
5765 return list;
5766
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005767 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 Py_DECREF(list);
5769 return NULL;
5770}
5771
5772PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005773 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005775 register Py_ssize_t i;
5776 register Py_ssize_t j;
5777 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 PyObject *list;
5779 PyObject *str;
5780 Py_UNICODE *data;
5781
5782 string = PyUnicode_FromObject(string);
5783 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005784 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 data = PyUnicode_AS_UNICODE(string);
5786 len = PyUnicode_GET_SIZE(string);
5787
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 list = PyList_New(0);
5789 if (!list)
5790 goto onError;
5791
5792 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005793 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005794
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005795 /* Find a line and append it */
5796 while (i < len && !BLOOM_LINEBREAK(data[i]))
5797 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005799 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005800 eol = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005801 if (i < len) {
5802 if (data[i] == '\r' && i + 1 < len &&
5803 data[i+1] == '\n')
5804 i += 2;
5805 else
5806 i++;
5807 if (keepends)
5808 eol = i;
5809 }
5810 SPLIT_APPEND(data, j, eol);
5811 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 }
5813 if (j < len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005814 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815 }
5816
5817 Py_DECREF(string);
5818 return list;
5819
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005820 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005821 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822 Py_DECREF(string);
5823 return NULL;
5824}
5825
Tim Petersced69f82003-09-16 20:30:58 +00005826static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827PyObject *split_char(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005828 PyObject *list,
5829 Py_UNICODE ch,
5830 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005832 register Py_ssize_t i;
5833 register Py_ssize_t j;
5834 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005836 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837
5838 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005839 if (buf[i] == ch) {
5840 if (maxcount-- <= 0)
5841 break;
5842 SPLIT_APPEND(buf, j, i);
5843 i = j = i + 1;
5844 } else
5845 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 }
5847 if (j <= len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005848 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 }
5850 return list;
5851
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005852 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 Py_DECREF(list);
5854 return NULL;
5855}
5856
Tim Petersced69f82003-09-16 20:30:58 +00005857static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858PyObject *split_substring(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005859 PyObject *list,
5860 PyUnicodeObject *substring,
5861 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863 register Py_ssize_t i;
5864 register Py_ssize_t j;
5865 Py_ssize_t len = self->length;
5866 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 PyObject *str;
5868
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005869 for (i = j = 0; i <= len - sublen; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005870 if (Py_UNICODE_MATCH(self, i, substring)) {
5871 if (maxcount-- <= 0)
5872 break;
5873 SPLIT_APPEND(self->str, j, i);
5874 i = j = i + sublen;
5875 } else
5876 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 }
5878 if (j <= len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005879 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880 }
5881 return list;
5882
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005883 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 Py_DECREF(list);
5885 return NULL;
5886}
5887
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005888static
5889PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005890 PyObject *list,
5891 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005892{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005893 register Py_ssize_t i;
5894 register Py_ssize_t j;
5895 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005896 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005897 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005898
5899 for (i = j = len - 1; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005900 /* find a token */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005901 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005902 i--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005903 j = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005904 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5905 i--;
5906 if (j > i) {
5907 if (maxcount-- <= 0)
5908 break;
5909 SPLIT_APPEND(buf, i + 1, j + 1);
5910 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5911 i--;
5912 j = i;
5913 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005914 }
5915 if (j >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005916 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005917 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005918 if (PyList_Reverse(list) < 0)
5919 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005920 return list;
5921
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005922 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005923 Py_DECREF(list);
5924 return NULL;
5925}
5926
Benjamin Peterson857ce152009-01-31 16:29:18 +00005927static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005928PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005929 PyObject *list,
5930 Py_UNICODE ch,
5931 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005932{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005933 register Py_ssize_t i;
5934 register Py_ssize_t j;
5935 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005936 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005937 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005938
5939 for (i = j = len - 1; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005940 if (buf[i] == ch) {
5941 if (maxcount-- <= 0)
5942 break;
5943 SPLIT_APPEND(buf, i + 1, j + 1);
5944 j = i = i - 1;
5945 } else
5946 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005947 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005948 if (j >= -1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005949 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005950 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005951 if (PyList_Reverse(list) < 0)
5952 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005953 return list;
5954
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005955 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005956 Py_DECREF(list);
5957 return NULL;
5958}
5959
Benjamin Peterson857ce152009-01-31 16:29:18 +00005960static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005961PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005962 PyObject *list,
5963 PyUnicodeObject *substring,
5964 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005965{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005966 register Py_ssize_t i;
5967 register Py_ssize_t j;
5968 Py_ssize_t len = self->length;
5969 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005970 PyObject *str;
5971
5972 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005973 if (Py_UNICODE_MATCH(self, i, substring)) {
5974 if (maxcount-- <= 0)
5975 break;
5976 SPLIT_APPEND(self->str, i + sublen, j);
5977 j = i;
5978 i -= sublen;
5979 } else
5980 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005981 }
5982 if (j >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005983 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005984 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005985 if (PyList_Reverse(list) < 0)
5986 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005987 return list;
5988
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005989 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005990 Py_DECREF(list);
5991 return NULL;
5992}
5993
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994#undef SPLIT_APPEND
5995
5996static
5997PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005998 PyUnicodeObject *substring,
5999 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000{
6001 PyObject *list;
6002
6003 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00006004 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005
6006 list = PyList_New(0);
6007 if (!list)
6008 return NULL;
6009
6010 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006011 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012
6013 else if (substring->length == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006014 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015
6016 else if (substring->length == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006017 Py_DECREF(list);
6018 PyErr_SetString(PyExc_ValueError, "empty separator");
6019 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 }
6021 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006022 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023}
6024
Tim Petersced69f82003-09-16 20:30:58 +00006025static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006026PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006027 PyUnicodeObject *substring,
6028 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006029{
6030 PyObject *list;
6031
6032 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00006033 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006034
6035 list = PyList_New(0);
6036 if (!list)
6037 return NULL;
6038
6039 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006040 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006041
6042 else if (substring->length == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006043 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006044
6045 else if (substring->length == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006046 Py_DECREF(list);
6047 PyErr_SetString(PyExc_ValueError, "empty separator");
6048 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006049 }
6050 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006051 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006052}
6053
6054static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006056 PyUnicodeObject *str1,
6057 PyUnicodeObject *str2,
6058 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059{
6060 PyUnicodeObject *u;
6061
6062 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006063 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064
Fredrik Lundh347ee272006-05-24 16:35:18 +00006065 if (str1->length == str2->length) {
6066 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00006067 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006068 if (str1->length == 1) {
6069 /* replace characters */
6070 Py_UNICODE u1, u2;
6071 if (!findchar(self->str, self->length, str1->str[0]))
6072 goto nothing;
6073 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6074 if (!u)
6075 return NULL;
6076 Py_UNICODE_COPY(u->str, self->str, self->length);
6077 u1 = str1->str[0];
6078 u2 = str2->str[0];
6079 for (i = 0; i < u->length; i++)
6080 if (u->str[i] == u1) {
6081 if (--maxcount < 0)
6082 break;
6083 u->str[i] = u2;
6084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006086 i = fastsearch(
6087 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00006089 if (i < 0)
6090 goto nothing;
6091 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6092 if (!u)
6093 return NULL;
6094 Py_UNICODE_COPY(u->str, self->str, self->length);
6095 while (i <= self->length - str1->length)
6096 if (Py_UNICODE_MATCH(self, i, str1)) {
6097 if (--maxcount < 0)
6098 break;
6099 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6100 i += str1->length;
6101 } else
6102 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006105
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006106 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006107 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 Py_UNICODE *p;
6109
6110 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006111 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 if (n > maxcount)
6113 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006114 if (n == 0)
6115 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006116 /* new_size = self->length + n * (str2->length - str1->length)); */
6117 delta = (str2->length - str1->length);
6118 if (delta == 0) {
6119 new_size = self->length;
6120 } else {
6121 product = n * (str2->length - str1->length);
6122 if ((product / (str2->length - str1->length)) != n) {
6123 PyErr_SetString(PyExc_OverflowError,
6124 "replace string is too long");
6125 return NULL;
6126 }
6127 new_size = self->length + product;
6128 if (new_size < 0) {
6129 PyErr_SetString(PyExc_OverflowError,
6130 "replace string is too long");
6131 return NULL;
6132 }
6133 }
6134 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006135 if (!u)
6136 return NULL;
6137 i = 0;
6138 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006139 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006140 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006141 while (n-- > 0) {
6142 /* look for next match */
6143 j = i;
6144 while (j <= e) {
6145 if (Py_UNICODE_MATCH(self, j, str1))
6146 break;
6147 j++;
6148 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006149 if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006150 if (j > e)
6151 break;
6152 /* copy unchanged part [i:j] */
6153 Py_UNICODE_COPY(p, self->str+i, j-i);
6154 p += j - i;
6155 }
6156 /* copy substitution string */
6157 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006158 Py_UNICODE_COPY(p, str2->str, str2->length);
6159 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006160 }
6161 i = j + str1->length;
6162 }
6163 if (i < self->length)
6164 /* copy tail [i:] */
6165 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006166 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006167 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006168 while (n > 0) {
6169 Py_UNICODE_COPY(p, str2->str, str2->length);
6170 p += str2->length;
6171 if (--n <= 0)
6172 break;
6173 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006175 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 }
6177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006179
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006180 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006181 /* nothing to replace; return original string (when possible) */
6182 if (PyUnicode_CheckExact(self)) {
6183 Py_INCREF(self);
6184 return (PyObject *) self;
6185 }
6186 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187}
6188
6189/* --- Unicode Object Methods --------------------------------------------- */
6190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006191PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006192 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193\n\
6194Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006195characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196
6197static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006198unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 return fixup(self, fixtitle);
6201}
6202
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006203PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006204 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205\n\
6206Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006207have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208
6209static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006210unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 return fixup(self, fixcapitalize);
6213}
6214
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out).  Splits self on whitespace, replaces every word
   in the resulting list by its fixup(..., fixcapitalize) version, and joins
   the list again with PyUnicode_Join(NULL, ...).  Returns NULL if any step
   fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the list's reference to the old word before overwriting
           the slot with the new one */
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
6252
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006253/* Argument converter. Coerces to a single unicode character */
6254
6255static int
6256convert_uc(PyObject *obj, void *addr)
6257{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006258 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6259 PyObject *uniobj;
6260 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006261
Benjamin Peterson857ce152009-01-31 16:29:18 +00006262 uniobj = PyUnicode_FromObject(obj);
6263 if (uniobj == NULL) {
6264 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006265 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006266 return 0;
6267 }
6268 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6269 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006270 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006271 Py_DECREF(uniobj);
6272 return 0;
6273 }
6274 unistr = PyUnicode_AS_UNICODE(uniobj);
6275 *fillcharloc = unistr[0];
6276 Py_DECREF(uniobj);
6277 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006278}
6279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006280PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006281 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006283Return S centered in a Unicode string of length width. Padding is\n\
6284done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285
6286static PyObject *
6287unicode_center(PyUnicodeObject *self, PyObject *args)
6288{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006289 Py_ssize_t marg, left;
6290 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006291 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292
Thomas Woutersde017742006-02-16 19:34:37 +00006293 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 return NULL;
6295
Tim Peters7a29bd52001-09-12 03:03:31 +00006296 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 Py_INCREF(self);
6298 return (PyObject*) self;
6299 }
6300
6301 marg = width - self->length;
6302 left = marg / 2 + (marg & width & 1);
6303
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006304 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305}
6306
Marc-André Lemburge5034372000-08-08 08:04:29 +00006307#if 0
6308
6309/* This code should go into some future Unicode collation support
6310 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006311 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006312
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006313/* speedy UTF-16 code point order comparison */
6314/* gleaned from: */
6315/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6316
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006317static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006318{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006319 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006320 0, 0, 0, 0, 0, 0, 0, 0,
6321 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006322 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006323};
6324
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325static int
6326unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6327{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006328 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006329
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 Py_UNICODE *s1 = str1->str;
6331 Py_UNICODE *s2 = str2->str;
6332
6333 len1 = str1->length;
6334 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006335
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006337 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006338
6339 c1 = *s1++;
6340 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006341
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006342 if (c1 > (1<<11) * 26)
6343 c1 += utf16Fixup[c1>>11];
6344 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006345 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006346 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006347
6348 if (c1 != c2)
6349 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006350
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006351 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 }
6353
6354 return (len1 < len2) ? -1 : (len1 != len2);
6355}
6356
Marc-André Lemburge5034372000-08-08 08:04:29 +00006357#else
6358
6359static int
6360unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6361{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006362 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006363
6364 Py_UNICODE *s1 = str1->str;
6365 Py_UNICODE *s2 = str2->str;
6366
6367 len1 = str1->length;
6368 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006369
Marc-André Lemburge5034372000-08-08 08:04:29 +00006370 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006371 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006372
Fredrik Lundh45714e92001-06-26 16:39:36 +00006373 c1 = *s1++;
6374 c2 = *s2++;
6375
6376 if (c1 != c2)
6377 return (c1 < c2) ? -1 : 1;
6378
Marc-André Lemburge5034372000-08-08 08:04:29 +00006379 len1--; len2--;
6380 }
6381
6382 return (len1 < len2) ? -1 : (len1 != len2);
6383}
6384
6385#endif
6386
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006388 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389{
6390 PyUnicodeObject *u = NULL, *v = NULL;
6391 int result;
6392
6393 /* Coerce the two arguments */
6394 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6395 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006396 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6398 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006399 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400
Thomas Wouters7e474022000-07-16 12:04:32 +00006401 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006403 Py_DECREF(u);
6404 Py_DECREF(v);
6405 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006406 }
6407
6408 result = unicode_compare(u, v);
6409
6410 Py_DECREF(u);
6411 Py_DECREF(v);
6412 return result;
6413
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006414 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415 Py_XDECREF(u);
6416 Py_XDECREF(v);
6417 return -1;
6418}
6419
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006420PyObject *PyUnicode_RichCompare(PyObject *left,
6421 PyObject *right,
6422 int op)
6423{
6424 int result;
6425
6426 result = PyUnicode_Compare(left, right);
6427 if (result == -1 && PyErr_Occurred())
6428 goto onError;
6429
6430 /* Convert the return value to a Boolean */
6431 switch (op) {
6432 case Py_EQ:
6433 result = (result == 0);
6434 break;
6435 case Py_NE:
6436 result = (result != 0);
6437 break;
6438 case Py_LE:
6439 result = (result <= 0);
6440 break;
6441 case Py_GE:
6442 result = (result >= 0);
6443 break;
6444 case Py_LT:
6445 result = (result == -1);
6446 break;
6447 case Py_GT:
6448 result = (result == 1);
6449 break;
6450 }
6451 return PyBool_FromLong(result);
6452
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006453 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006454
6455 /* Standard case
6456
6457 Type errors mean that PyUnicode_FromObject() could not convert
6458 one of the arguments (usually the right hand side) to Unicode,
6459 ie. we can't handle the comparison request. However, it is
6460 possible that the other object knows a comparison method, which
6461 is why we return Py_NotImplemented to give the other object a
6462 chance.
6463
6464 */
6465 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6466 PyErr_Clear();
6467 Py_INCREF(Py_NotImplemented);
6468 return Py_NotImplemented;
6469 }
6470 if (op != Py_EQ && op != Py_NE)
6471 return NULL;
6472
6473 /* Equality comparison.
6474
6475 This is a special case: we silence any PyExc_UnicodeDecodeError
6476 and instead turn it into a PyErr_UnicodeWarning.
6477
6478 */
6479 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6480 return NULL;
6481 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006482 if (PyErr_Warn(PyExc_UnicodeWarning,
6483 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006484 "Unicode equal comparison "
6485 "failed to convert both arguments to Unicode - "
6486 "interpreting them as being unequal" :
6487 "Unicode unequal comparison "
6488 "failed to convert both arguments to Unicode - "
6489 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006490 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006491 return NULL;
6492 result = (op == Py_NE);
6493 return PyBool_FromLong(result);
6494}
6495
Guido van Rossum403d68b2000-03-13 15:55:09 +00006496int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006497 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006498{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006499 PyObject *str, *sub;
6500 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006501
6502 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006503 sub = PyUnicode_FromObject(element);
6504 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006505 PyErr_SetString(PyExc_TypeError,
6506 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006507 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006508 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006509
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006510 str = PyUnicode_FromObject(container);
6511 if (!str) {
6512 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006513 return -1;
6514 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006515
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006516 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006517
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006518 Py_DECREF(str);
6519 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006520
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006521 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006522}
6523
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524/* Concat to string or Unicode object giving a new Unicode object. */
6525
6526PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006527 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528{
6529 PyUnicodeObject *u = NULL, *v = NULL, *w;
6530
6531 /* Coerce the two arguments */
6532 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6533 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006534 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6536 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006537 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538
6539 /* Shortcuts */
6540 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006541 Py_DECREF(v);
6542 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 }
6544 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006545 Py_DECREF(u);
6546 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 }
6548
6549 /* Concat the two Unicode strings */
6550 w = _PyUnicode_New(u->length + v->length);
6551 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006552 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553 Py_UNICODE_COPY(w->str, u->str, u->length);
6554 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6555
6556 Py_DECREF(u);
6557 Py_DECREF(v);
6558 return (PyObject *)w;
6559
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006560 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 Py_XDECREF(u);
6562 Py_XDECREF(v);
6563 return NULL;
6564}
6565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006566PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006567 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006569Return the number of non-overlapping occurrences of substring sub in\n\
6570Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006571interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
6573static PyObject *
6574unicode_count(PyUnicodeObject *self, PyObject *args)
6575{
6576 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006577 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006578 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 PyObject *result;
6580
Guido van Rossumb8872e62000-05-09 14:14:27 +00006581 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006582 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 return NULL;
6584
6585 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006586 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006588 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006589
Fredrik Lundhc8162812006-05-26 19:33:03 +00006590 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006592 result = PyInt_FromSsize_t(
6593 stringlib_count(self->str + start, end - start,
6594 substring->str, substring->length)
6595 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
6597 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006598
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 return result;
6600}
6601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006602PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006603 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006605Encodes S using the codec registered for encoding. encoding defaults\n\
6606to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006607handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6609'xmlcharrefreplace' as well as any other name registered with\n\
6610codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
6612static PyObject *
6613unicode_encode(PyUnicodeObject *self, PyObject *args)
6614{
6615 char *encoding = NULL;
6616 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006617 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006618
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6620 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006621 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006622 if (v == NULL)
6623 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006624 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006625 PyErr_Format(PyExc_TypeError,
6626 "encoder did not return a string/unicode object "
6627 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006628 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006629 Py_DECREF(v);
6630 return NULL;
6631 }
6632 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006633
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006634 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006635 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006636}
6637
6638PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006639 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006640\n\
6641Decodes S using the codec registered for encoding. encoding defaults\n\
6642to the default encoding. errors may be given to set a different error\n\
6643handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6644a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6645as well as any other name registerd with codecs.register_error that is\n\
6646able to handle UnicodeDecodeErrors.");
6647
6648static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006649unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006650{
6651 char *encoding = NULL;
6652 char *errors = NULL;
6653 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006654
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006655 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6656 return NULL;
6657 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006658 if (v == NULL)
6659 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006660 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006661 PyErr_Format(PyExc_TypeError,
6662 "decoder did not return a string/unicode object "
6663 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006664 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006665 Py_DECREF(v);
6666 return NULL;
6667 }
6668 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006669
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006670 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006671 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672}
6673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006674PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006675 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676\n\
6677Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006678If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679
6680static PyObject*
6681unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6682{
6683 Py_UNICODE *e;
6684 Py_UNICODE *p;
6685 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006686 Py_UNICODE *qe;
6687 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 PyUnicodeObject *u;
6689 int tabsize = 8;
6690
6691 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006692 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
Thomas Wouters7e474022000-07-16 12:04:32 +00006694 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006695 i = 0; /* chars up to and including most recent \n or \r */
6696 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6697 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 for (p = self->str; p < e; p++)
6699 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006700 if (tabsize > 0) {
6701 incr = tabsize - (j % tabsize); /* cannot overflow */
6702 if (j > PY_SSIZE_T_MAX - incr)
6703 goto overflow1;
6704 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006705 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006708 if (j > PY_SSIZE_T_MAX - 1)
6709 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 j++;
6711 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006712 if (i > PY_SSIZE_T_MAX - j)
6713 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006715 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 }
6717 }
6718
Guido van Rossum5bdff602008-03-11 21:18:06 +00006719 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006720 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006721
Guido van Rossumd57fd912000-03-10 22:53:23 +00006722 /* Second pass: create output string and fill it */
6723 u = _PyUnicode_New(i + j);
6724 if (!u)
6725 return NULL;
6726
Guido van Rossum5bdff602008-03-11 21:18:06 +00006727 j = 0; /* same as in first pass */
6728 q = u->str; /* next output char */
6729 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730
6731 for (p = self->str; p < e; p++)
6732 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006733 if (tabsize > 0) {
6734 i = tabsize - (j % tabsize);
6735 j += i;
6736 while (i--) {
6737 if (q >= qe)
6738 goto overflow2;
6739 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006740 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006741 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006742 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006743 else {
6744 if (q >= qe)
6745 goto overflow2;
6746 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006747 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 if (*p == '\n' || *p == '\r')
6749 j = 0;
6750 }
6751
6752 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006753
6754 overflow2:
6755 Py_DECREF(u);
6756 overflow1:
6757 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759}
6760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006761PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006762 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763\n\
6764Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006765such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766arguments start and end are interpreted as in slice notation.\n\
6767\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006768Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769
6770static PyObject *
6771unicode_find(PyUnicodeObject *self, PyObject *args)
6772{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006773 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006774 Py_ssize_t start;
6775 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006776 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777
Facundo Batista57d56692007-11-16 18:04:14 +00006778 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006781 result = stringlib_find_slice(
6782 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6783 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6784 start, end
6785 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786
6787 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006788
6789 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790}
6791
6792static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006793unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794{
6795 if (index < 0 || index >= self->length) {
6796 PyErr_SetString(PyExc_IndexError, "string index out of range");
6797 return NULL;
6798 }
6799
6800 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6801}
6802
6803static long
6804unicode_hash(PyUnicodeObject *self)
6805{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006806 /* Since Unicode objects compare equal to their ASCII string
6807 counterparts, they should use the individual character values
6808 as basis for their hash value. This is needed to assure that
6809 strings and Unicode objects behave in the same way as
6810 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811
Martin v. Löwis18e16552006-02-15 17:27:45 +00006812 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006813 register Py_UNICODE *p;
6814 register long x;
6815
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006817 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006818 len = PyUnicode_GET_SIZE(self);
6819 p = PyUnicode_AS_UNICODE(self);
6820 x = *p << 7;
6821 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006822 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006823 x ^= PyUnicode_GET_SIZE(self);
6824 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006825 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006826 self->hash = x;
6827 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828}
6829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006830PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006831 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006833Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834
6835static PyObject *
6836unicode_index(PyUnicodeObject *self, PyObject *args)
6837{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006838 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006839 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006840 Py_ssize_t start;
6841 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842
Facundo Batista57d56692007-11-16 18:04:14 +00006843 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006846 result = stringlib_find_slice(
6847 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6848 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6849 start, end
6850 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
6852 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006853
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 if (result < 0) {
6855 PyErr_SetString(PyExc_ValueError, "substring not found");
6856 return NULL;
6857 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006858
Martin v. Löwis18e16552006-02-15 17:27:45 +00006859 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860}
6861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006862PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006863 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006865Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006866at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867
6868static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006869unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870{
6871 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6872 register const Py_UNICODE *e;
6873 int cased;
6874
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 /* Shortcut for single character strings */
6876 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006877 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006879 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006880 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006881 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006882
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883 e = p + PyUnicode_GET_SIZE(self);
6884 cased = 0;
6885 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006886 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006887
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006888 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6889 return PyBool_FromLong(0);
6890 else if (!cased && Py_UNICODE_ISLOWER(ch))
6891 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006893 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894}
6895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006896PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006897 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006899Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006900at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
6902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006903unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904{
6905 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6906 register const Py_UNICODE *e;
6907 int cased;
6908
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 /* Shortcut for single character strings */
6910 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006911 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006913 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006914 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006915 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006916
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917 e = p + PyUnicode_GET_SIZE(self);
6918 cased = 0;
6919 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006920 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006921
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006922 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6923 return PyBool_FromLong(0);
6924 else if (!cased && Py_UNICODE_ISUPPER(ch))
6925 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006927 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928}
6929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006930PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006931 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006933Return True if S is a titlecased string and there is at least one\n\
6934character in S, i.e. upper- and titlecase characters may only\n\
6935follow uncased characters and lowercase characters only cased ones.\n\
6936Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937
6938static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006939unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940{
6941 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6942 register const Py_UNICODE *e;
6943 int cased, previous_is_cased;
6944
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 /* Shortcut for single character strings */
6946 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006947 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6948 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006950 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006951 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006952 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006953
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 e = p + PyUnicode_GET_SIZE(self);
6955 cased = 0;
6956 previous_is_cased = 0;
6957 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006958 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006959
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006960 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6961 if (previous_is_cased)
6962 return PyBool_FromLong(0);
6963 previous_is_cased = 1;
6964 cased = 1;
6965 }
6966 else if (Py_UNICODE_ISLOWER(ch)) {
6967 if (!previous_is_cased)
6968 return PyBool_FromLong(0);
6969 previous_is_cased = 1;
6970 cased = 1;
6971 }
6972 else
6973 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006974 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006975 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976}
6977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006978PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006979 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006981Return True if all characters in S are whitespace\n\
6982and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983
6984static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006985unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986{
6987 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6988 register const Py_UNICODE *e;
6989
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 /* Shortcut for single character strings */
6991 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006992 Py_UNICODE_ISSPACE(*p))
6993 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006995 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006996 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006997 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006998
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 e = p + PyUnicode_GET_SIZE(self);
7000 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007001 if (!Py_UNICODE_ISSPACE(*p))
7002 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007004 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005}
7006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007007PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007008 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007009\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007010Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007011and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007012
7013static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007014unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007015{
7016 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7017 register const Py_UNICODE *e;
7018
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007019 /* Shortcut for single character strings */
7020 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007021 Py_UNICODE_ISALPHA(*p))
7022 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007023
7024 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007025 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007026 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007027
7028 e = p + PyUnicode_GET_SIZE(self);
7029 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007030 if (!Py_UNICODE_ISALPHA(*p))
7031 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007032 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007033 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007034}
7035
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007036PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007037 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007038\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007039Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007040and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007041
7042static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007043unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007044{
7045 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7046 register const Py_UNICODE *e;
7047
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007048 /* Shortcut for single character strings */
7049 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007050 Py_UNICODE_ISALNUM(*p))
7051 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007052
7053 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007054 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007055 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007056
7057 e = p + PyUnicode_GET_SIZE(self);
7058 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007059 if (!Py_UNICODE_ISALNUM(*p))
7060 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007061 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007062 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007063}
7064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007065PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007066 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007068Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007069False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070
7071static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007072unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073{
7074 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7075 register const Py_UNICODE *e;
7076
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077 /* Shortcut for single character strings */
7078 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007079 Py_UNICODE_ISDECIMAL(*p))
7080 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007082 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007083 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007084 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007085
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086 e = p + PyUnicode_GET_SIZE(self);
7087 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007088 if (!Py_UNICODE_ISDECIMAL(*p))
7089 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007091 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092}
7093
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007094PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007095 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007097Return True if all characters in S are digits\n\
7098and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099
7100static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007101unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102{
7103 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7104 register const Py_UNICODE *e;
7105
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 /* Shortcut for single character strings */
7107 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007108 Py_UNICODE_ISDIGIT(*p))
7109 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007111 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007112 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007113 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007114
Guido van Rossumd57fd912000-03-10 22:53:23 +00007115 e = p + PyUnicode_GET_SIZE(self);
7116 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007117 if (!Py_UNICODE_ISDIGIT(*p))
7118 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007120 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121}
7122
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007123PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007124 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007126Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007127False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128
7129static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007130unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131{
7132 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7133 register const Py_UNICODE *e;
7134
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135 /* Shortcut for single character strings */
7136 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007137 Py_UNICODE_ISNUMERIC(*p))
7138 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007140 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007141 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007142 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007143
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 e = p + PyUnicode_GET_SIZE(self);
7145 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007146 if (!Py_UNICODE_ISNUMERIC(*p))
7147 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007149 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150}
7151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007152PyDoc_STRVAR(join__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007153 "S.join(sequence) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007154\n\
7155Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007156sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157
7158static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007159unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007161 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162}
7163
Martin v. Löwis18e16552006-02-15 17:27:45 +00007164static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007165unicode_length(PyUnicodeObject *self)
7166{
7167 return self->length;
7168}
7169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007170PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007171 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007173Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007174done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175
7176static PyObject *
7177unicode_ljust(PyUnicodeObject *self, PyObject *args)
7178{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007179 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007180 Py_UNICODE fillchar = ' ';
7181
Martin v. Löwis412fb672006-04-13 06:34:32 +00007182 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007183 return NULL;
7184
Tim Peters7a29bd52001-09-12 03:03:31 +00007185 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 Py_INCREF(self);
7187 return (PyObject*) self;
7188 }
7189
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007190 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191}
7192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007193PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007194 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007196Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007197
7198static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007199unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201 return fixup(self, fixlower);
7202}
7203
/* Selector values for the strip helpers.  They double as indices into
   stripformat[] below, so their numeric values must stay 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7212
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007213/* externally visible for str.strip(unicode) */
7214PyObject *
7215_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7216{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007217 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7218 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7219 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7220 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7221 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007222
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007223 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007224
Benjamin Peterson857ce152009-01-31 16:29:18 +00007225 i = 0;
7226 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007227 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7228 i++;
7229 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007230 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007231
Benjamin Peterson857ce152009-01-31 16:29:18 +00007232 j = len;
7233 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007234 do {
7235 j--;
7236 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7237 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007238 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007239
Benjamin Peterson857ce152009-01-31 16:29:18 +00007240 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007241 Py_INCREF(self);
7242 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007243 }
7244 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007245 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007246}
7247
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
7249static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007250do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007252 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7253 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007254
Benjamin Peterson857ce152009-01-31 16:29:18 +00007255 i = 0;
7256 if (striptype != RIGHTSTRIP) {
7257 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7258 i++;
7259 }
7260 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007261
Benjamin Peterson857ce152009-01-31 16:29:18 +00007262 j = len;
7263 if (striptype != LEFTSTRIP) {
7264 do {
7265 j--;
7266 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7267 j++;
7268 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007269
Benjamin Peterson857ce152009-01-31 16:29:18 +00007270 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7271 Py_INCREF(self);
7272 return (PyObject*)self;
7273 }
7274 else
7275 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276}
7277
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007278
7279static PyObject *
7280do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7281{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007282 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007283
Benjamin Peterson857ce152009-01-31 16:29:18 +00007284 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7285 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007286
Benjamin Peterson857ce152009-01-31 16:29:18 +00007287 if (sep != NULL && sep != Py_None) {
7288 if (PyUnicode_Check(sep))
7289 return _PyUnicode_XStrip(self, striptype, sep);
7290 else if (PyString_Check(sep)) {
7291 PyObject *res;
7292 sep = PyUnicode_FromObject(sep);
7293 if (sep==NULL)
7294 return NULL;
7295 res = _PyUnicode_XStrip(self, striptype, sep);
7296 Py_DECREF(sep);
7297 return res;
7298 }
7299 else {
7300 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007301 "%s arg must be None, unicode or str",
7302 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007303 return NULL;
7304 }
7305 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007306
Benjamin Peterson857ce152009-01-31 16:29:18 +00007307 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007308}
7309
7310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007311PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007312 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007313\n\
7314Return a copy of the string S with leading and trailing\n\
7315whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007316If chars is given and not None, remove characters in chars instead.\n\
7317If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007318
7319static PyObject *
7320unicode_strip(PyUnicodeObject *self, PyObject *args)
7321{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007322 if (PyTuple_GET_SIZE(args) == 0)
7323 return do_strip(self, BOTHSTRIP); /* Common case */
7324 else
7325 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007326}
7327
7328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007329PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007330 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007331\n\
7332Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007333If chars is given and not None, remove characters in chars instead.\n\
7334If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007335
7336static PyObject *
7337unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7338{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007339 if (PyTuple_GET_SIZE(args) == 0)
7340 return do_strip(self, LEFTSTRIP); /* Common case */
7341 else
7342 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007343}
7344
7345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007346PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007347 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007348\n\
7349Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007350If chars is given and not None, remove characters in chars instead.\n\
7351If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007352
7353static PyObject *
7354unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7355{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007356 if (PyTuple_GET_SIZE(args) == 0)
7357 return do_strip(self, RIGHTSTRIP); /* Common case */
7358 else
7359 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007360}
7361
7362
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007364unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365{
7366 PyUnicodeObject *u;
7367 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007368 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007369 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007370
7371 if (len < 0)
7372 len = 0;
7373
Tim Peters7a29bd52001-09-12 03:03:31 +00007374 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 /* no repeat, return original string */
7376 Py_INCREF(str);
7377 return (PyObject*) str;
7378 }
Tim Peters8f422462000-09-09 06:13:41 +00007379
7380 /* ensure # of chars needed doesn't overflow int and # of bytes
7381 * needed doesn't overflow size_t
7382 */
7383 nchars = len * str->length;
7384 if (len && nchars / len != str->length) {
7385 PyErr_SetString(PyExc_OverflowError,
7386 "repeated string is too long");
7387 return NULL;
7388 }
7389 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7390 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7391 PyErr_SetString(PyExc_OverflowError,
7392 "repeated string is too long");
7393 return NULL;
7394 }
7395 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 if (!u)
7397 return NULL;
7398
7399 p = u->str;
7400
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007401 if (str->length == 1 && len > 0) {
7402 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007403 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007404 Py_ssize_t done = 0; /* number of characters copied this far */
7405 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007406 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007407 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007408 }
7409 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007410 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007411 Py_UNICODE_COPY(p+done, p, n);
7412 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007413 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007414 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415
7416 return (PyObject*) u;
7417}
7418
7419PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007420 PyObject *subobj,
7421 PyObject *replobj,
7422 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007423{
7424 PyObject *self;
7425 PyObject *str1;
7426 PyObject *str2;
7427 PyObject *result;
7428
7429 self = PyUnicode_FromObject(obj);
7430 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432 str1 = PyUnicode_FromObject(subobj);
7433 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007434 Py_DECREF(self);
7435 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 }
7437 str2 = PyUnicode_FromObject(replobj);
7438 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007439 Py_DECREF(self);
7440 Py_DECREF(str1);
7441 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442 }
Tim Petersced69f82003-09-16 20:30:58 +00007443 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007444 (PyUnicodeObject *)str1,
7445 (PyUnicodeObject *)str2,
7446 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 Py_DECREF(self);
7448 Py_DECREF(str1);
7449 Py_DECREF(str2);
7450 return result;
7451}
7452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007453PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007454 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455\n\
7456Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007457old replaced by new. If the optional argument count is\n\
7458given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459
7460static PyObject*
7461unicode_replace(PyUnicodeObject *self, PyObject *args)
7462{
7463 PyUnicodeObject *str1;
7464 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007465 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 PyObject *result;
7467
Martin v. Löwis18e16552006-02-15 17:27:45 +00007468 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007469 return NULL;
7470 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7471 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007472 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007474 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007475 Py_DECREF(str1);
7476 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478
7479 result = replace(self, str1, str2, maxcount);
7480
7481 Py_DECREF(str1);
7482 Py_DECREF(str2);
7483 return result;
7484}
7485
7486static
7487PyObject *unicode_repr(PyObject *unicode)
7488{
7489 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007490 PyUnicode_GET_SIZE(unicode),
7491 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492}
7493
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007494PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007495 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007496\n\
7497Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007498such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499arguments start and end are interpreted as in slice notation.\n\
7500\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007501Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502
7503static PyObject *
7504unicode_rfind(PyUnicodeObject *self, PyObject *args)
7505{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007506 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007507 Py_ssize_t start;
7508 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007509 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510
Facundo Batista57d56692007-11-16 18:04:14 +00007511 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007514 result = stringlib_rfind_slice(
7515 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7516 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7517 start, end
7518 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519
7520 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007521
7522 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523}
7524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007525PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007526 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007527\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007528Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529
7530static PyObject *
7531unicode_rindex(PyUnicodeObject *self, PyObject *args)
7532{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007533 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007534 Py_ssize_t start;
7535 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007536 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537
Facundo Batista57d56692007-11-16 18:04:14 +00007538 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007541 result = stringlib_rfind_slice(
7542 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7543 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7544 start, end
7545 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
7547 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007548
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 if (result < 0) {
7550 PyErr_SetString(PyExc_ValueError, "substring not found");
7551 return NULL;
7552 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007553 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554}
7555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007556PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007557 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007559Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007560done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
7562static PyObject *
7563unicode_rjust(PyUnicodeObject *self, PyObject *args)
7564{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007565 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007566 Py_UNICODE fillchar = ' ';
7567
Martin v. Löwis412fb672006-04-13 06:34:32 +00007568 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 return NULL;
7570
Tim Peters7a29bd52001-09-12 03:03:31 +00007571 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 Py_INCREF(self);
7573 return (PyObject*) self;
7574 }
7575
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007576 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577}
7578
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007580unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581{
7582 /* standard clamping */
7583 if (start < 0)
7584 start = 0;
7585 if (end < 0)
7586 end = 0;
7587 if (end > self->length)
7588 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007589 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 /* full slice, return original string */
7591 Py_INCREF(self);
7592 return (PyObject*) self;
7593 }
7594 if (start > end)
7595 start = end;
7596 /* copy slice */
7597 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007598 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599}
7600
7601PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007602 PyObject *sep,
7603 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604{
7605 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007606
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 s = PyUnicode_FromObject(s);
7608 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007609 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007610 if (sep != NULL) {
7611 sep = PyUnicode_FromObject(sep);
7612 if (sep == NULL) {
7613 Py_DECREF(s);
7614 return NULL;
7615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616 }
7617
7618 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7619
7620 Py_DECREF(s);
7621 Py_XDECREF(sep);
7622 return result;
7623}
7624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007625PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007626 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627\n\
7628Return a list of the words in S, using sep as the\n\
7629delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007630splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007631whitespace string is a separator and empty strings are\n\
7632removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
7634static PyObject*
7635unicode_split(PyUnicodeObject *self, PyObject *args)
7636{
7637 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007638 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639
Martin v. Löwis18e16552006-02-15 17:27:45 +00007640 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641 return NULL;
7642
7643 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007644 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007646 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007648 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649}
7650
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007651PyObject *
7652PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7653{
7654 PyObject* str_obj;
7655 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007656 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007657
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007658 str_obj = PyUnicode_FromObject(str_in);
7659 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007660 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007661 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007662 if (!sep_obj) {
7663 Py_DECREF(str_obj);
7664 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007665 }
7666
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007667 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007668 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7669 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7670 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007671
Fredrik Lundhb9479482006-05-26 17:22:38 +00007672 Py_DECREF(sep_obj);
7673 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007674
7675 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007676}
7677
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007678
7679PyObject *
7680PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7681{
7682 PyObject* str_obj;
7683 PyObject* sep_obj;
7684 PyObject* out;
7685
7686 str_obj = PyUnicode_FromObject(str_in);
7687 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007688 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007689 sep_obj = PyUnicode_FromObject(sep_in);
7690 if (!sep_obj) {
7691 Py_DECREF(str_obj);
7692 return NULL;
7693 }
7694
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007695 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007696 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7697 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7698 );
7699
7700 Py_DECREF(sep_obj);
7701 Py_DECREF(str_obj);
7702
7703 return out;
7704}
7705
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007706PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007707 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007708\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007709Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007710the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007711found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007712
7713static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007714unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007715{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007716 return PyUnicode_Partition((PyObject *)self, separator);
7717}
7718
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007719PyDoc_STRVAR(rpartition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007720 "S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007721\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007722Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007723the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007724separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007725
7726static PyObject*
7727unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7728{
7729 return PyUnicode_RPartition((PyObject *)self, separator);
7730}
7731
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007732PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007733 PyObject *sep,
7734 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007735{
7736 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007737
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007738 s = PyUnicode_FromObject(s);
7739 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007740 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007741 if (sep != NULL) {
7742 sep = PyUnicode_FromObject(sep);
7743 if (sep == NULL) {
7744 Py_DECREF(s);
7745 return NULL;
7746 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007747 }
7748
7749 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7750
7751 Py_DECREF(s);
7752 Py_XDECREF(sep);
7753 return result;
7754}
7755
7756PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007757 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007758\n\
7759Return a list of the words in S, using sep as the\n\
7760delimiter string, starting at the end of the string and\n\
7761working to the front. If maxsplit is given, at most maxsplit\n\
7762splits are done. If sep is not specified, any whitespace string\n\
7763is a separator.");
7764
7765static PyObject*
7766unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7767{
7768 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007769 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007770
Martin v. Löwis18e16552006-02-15 17:27:45 +00007771 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007772 return NULL;
7773
7774 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007775 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007776 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007777 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007778 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007779 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007780}
7781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007782PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007783 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784\n\
7785Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007786Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007787is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788
7789static PyObject*
7790unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7791{
Guido van Rossum86662912000-04-11 15:38:46 +00007792 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793
Guido van Rossum86662912000-04-11 15:38:46 +00007794 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795 return NULL;
7796
Guido van Rossum86662912000-04-11 15:38:46 +00007797 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798}
7799
7800static
7801PyObject *unicode_str(PyUnicodeObject *self)
7802{
Fred Drakee4315f52000-05-09 19:53:39 +00007803 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804}
7805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007806PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007807 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808\n\
7809Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007810and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811
7812static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007813unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815 return fixup(self, fixswapcase);
7816}
7817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007818PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007819 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820\n\
7821Return a copy of the string S, where all characters have been mapped\n\
7822through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007823Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7824Unmapped characters are left untouched. Characters mapped to None\n\
7825are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826
7827static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007828unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829{
Tim Petersced69f82003-09-16 20:30:58 +00007830 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007831 self->length,
7832 table,
7833 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834}
7835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007836PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007837 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007839Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840
7841static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007842unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007843{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 return fixup(self, fixupper);
7845}
7846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007847PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007848 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849\n\
Georg Brandl98064072008-09-09 19:26:00 +00007850Pad a numeric string S with zeros on the left, to fill a field\n\
7851of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852
7853static PyObject *
7854unicode_zfill(PyUnicodeObject *self, PyObject *args)
7855{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007856 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857 PyUnicodeObject *u;
7858
Martin v. Löwis18e16552006-02-15 17:27:45 +00007859 Py_ssize_t width;
7860 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 return NULL;
7862
7863 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007864 if (PyUnicode_CheckExact(self)) {
7865 Py_INCREF(self);
7866 return (PyObject*) self;
7867 }
7868 else
7869 return PyUnicode_FromUnicode(
7870 PyUnicode_AS_UNICODE(self),
7871 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007872 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873 }
7874
7875 fill = width - self->length;
7876
7877 u = pad(self, fill, 0, '0');
7878
Walter Dörwald068325e2002-04-15 13:36:47 +00007879 if (u == NULL)
7880 return NULL;
7881
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882 if (u->str[fill] == '+' || u->str[fill] == '-') {
7883 /* move sign to beginning of string */
7884 u->str[0] = u->str[fill];
7885 u->str[fill] = '0';
7886 }
7887
7888 return (PyObject*) u;
7889}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890
#if 0
/* Compiled out by the surrounding #if 0: returns the value of numfree as
   a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007899PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007900 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007902Return True if S starts with the specified prefix, False otherwise.\n\
7903With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007904With optional end, stop comparing S at that position.\n\
7905prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007906
7907static PyObject *
7908unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007909 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910{
Georg Brandl24250812006-06-09 18:45:48 +00007911 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007913 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007914 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007915 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916
Georg Brandl24250812006-06-09 18:45:48 +00007917 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007918 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7919 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007920 if (PyTuple_Check(subobj)) {
7921 Py_ssize_t i;
7922 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7923 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007924 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007925 if (substring == NULL)
7926 return NULL;
7927 result = tailmatch(self, substring, start, end, -1);
7928 Py_DECREF(substring);
7929 if (result) {
7930 Py_RETURN_TRUE;
7931 }
7932 }
7933 /* nothing matched */
7934 Py_RETURN_FALSE;
7935 }
7936 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007937 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007938 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007939 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007941 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942}
7943
7944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007945PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007946 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007948Return True if S ends with the specified suffix, False otherwise.\n\
7949With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007950With optional end, stop comparing S at that position.\n\
7951suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952
7953static PyObject *
7954unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007955 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956{
Georg Brandl24250812006-06-09 18:45:48 +00007957 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007959 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007960 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007961 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962
Georg Brandl24250812006-06-09 18:45:48 +00007963 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007964 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7965 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007966 if (PyTuple_Check(subobj)) {
7967 Py_ssize_t i;
7968 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7969 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007970 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007971 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007972 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007973 result = tailmatch(self, substring, start, end, +1);
7974 Py_DECREF(substring);
7975 if (result) {
7976 Py_RETURN_TRUE;
7977 }
7978 }
7979 Py_RETURN_FALSE;
7980 }
7981 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984
Georg Brandl24250812006-06-09 18:45:48 +00007985 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007987 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988}
7989
7990
Eric Smitha9f7d622008-02-17 19:46:49 +00007991/* Implements do_string_format, which is unicode because of stringlib */
7992#include "stringlib/string_format.h"
7993
7994PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007995 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007996\n\
7997");
7998
Eric Smithdc13b792008-05-30 18:10:04 +00007999static PyObject *
8000unicode__format__(PyObject *self, PyObject *args)
8001{
8002 PyObject *format_spec;
8003 PyObject *result = NULL;
8004 PyObject *tmp = NULL;
8005
8006 /* If 2.x, convert format_spec to the same type as value */
8007 /* This is to allow things like u''.format('') */
8008 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
8009 goto done;
8010 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
8011 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008012 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00008013 goto done;
8014 }
8015 tmp = PyObject_Unicode(format_spec);
8016 if (tmp == NULL)
8017 goto done;
8018 format_spec = tmp;
8019
8020 result = _PyUnicode_FormatAdvanced(self,
8021 PyUnicode_AS_UNICODE(format_spec),
8022 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008023 done:
Eric Smithdc13b792008-05-30 18:10:04 +00008024 Py_XDECREF(tmp);
8025 return result;
8026}
8027
Eric Smitha9f7d622008-02-17 19:46:49 +00008028PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008029 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00008030\n\
8031");
8032
Robert Schuppenies901c9972008-06-10 10:10:31 +00008033static PyObject *
8034unicode__sizeof__(PyUnicodeObject *v)
8035{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00008036 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
8037 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00008038}
8039
8040PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008041 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00008042\n\
8043");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008044
8045static PyObject *
8046unicode_getnewargs(PyUnicodeObject *v)
8047{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008048 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00008049}
8050
8051
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052static PyMethodDef unicode_methods[] = {
8053
8054 /* Order is according to common usage: often used methods should
8055 appear first, since lookup is done sequentially. */
8056
Georg Brandlecdc0a92006-03-30 12:19:07 +00008057 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008058 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8059 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008060 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008061 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8062 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8063 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8064 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8065 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8066 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8067 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00008068 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008069 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8070 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8071 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008072 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00008073 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008074/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
8075 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8076 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8077 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008078 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00008079 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008080 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008081 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008082 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8083 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8084 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8085 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8086 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8087 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8088 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8089 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8090 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8091 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8092 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8093 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8094 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8095 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008096 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00008097 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8098 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8099 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8100 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00008101 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008102#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008103 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104#endif
8105
8106#if 0
8107 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008108 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109#endif
8110
Benjamin Peterson857ce152009-01-31 16:29:18 +00008111 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112 {NULL, NULL}
8113};
8114
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008115static PyObject *
8116unicode_mod(PyObject *v, PyObject *w)
8117{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008118 if (!PyUnicode_Check(v)) {
8119 Py_INCREF(Py_NotImplemented);
8120 return Py_NotImplemented;
8121 }
8122 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008123}
8124
8125static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008126 0, /*nb_add*/
8127 0, /*nb_subtract*/
8128 0, /*nb_multiply*/
8129 0, /*nb_divide*/
8130 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008131};
8132
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008134 (lenfunc) unicode_length, /* sq_length */
8135 PyUnicode_Concat, /* sq_concat */
8136 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8137 (ssizeargfunc) unicode_getitem, /* sq_item */
8138 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8139 0, /* sq_ass_item */
8140 0, /* sq_ass_slice */
8141 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142};
8143
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008144static PyObject*
8145unicode_subscript(PyUnicodeObject* self, PyObject* item)
8146{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008147 if (PyIndex_Check(item)) {
8148 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008149 if (i == -1 && PyErr_Occurred())
8150 return NULL;
8151 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008152 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008153 return unicode_getitem(self, i);
8154 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008155 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008156 Py_UNICODE* source_buf;
8157 Py_UNICODE* result_buf;
8158 PyObject* result;
8159
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008160 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008161 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008162 return NULL;
8163 }
8164
8165 if (slicelength <= 0) {
8166 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008167 } else if (start == 0 && step == 1 && slicelength == self->length &&
8168 PyUnicode_CheckExact(self)) {
8169 Py_INCREF(self);
8170 return (PyObject *)self;
8171 } else if (step == 1) {
8172 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008173 } else {
8174 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008175 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8176 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008177
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008178 if (result_buf == NULL)
8179 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008180
8181 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8182 result_buf[i] = source_buf[cur];
8183 }
Tim Petersced69f82003-09-16 20:30:58 +00008184
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008185 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008186 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008187 return result;
8188 }
8189 } else {
8190 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8191 return NULL;
8192 }
8193}
8194
8195static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008196 (lenfunc)unicode_length, /* mp_length */
8197 (binaryfunc)unicode_subscript, /* mp_subscript */
8198 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008199};
8200
Martin v. Löwis18e16552006-02-15 17:27:45 +00008201static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008203 Py_ssize_t index,
8204 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008205{
8206 if (index != 0) {
8207 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008208 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209 return -1;
8210 }
8211 *ptr = (void *) self->str;
8212 return PyUnicode_GET_DATA_SIZE(self);
8213}
8214
Martin v. Löwis18e16552006-02-15 17:27:45 +00008215static Py_ssize_t
8216unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008217 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218{
8219 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008220 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008221 return -1;
8222}
8223
8224static int
8225unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008226 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227{
8228 if (lenp)
8229 *lenp = PyUnicode_GET_DATA_SIZE(self);
8230 return 1;
8231}
8232
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008233static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008235 Py_ssize_t index,
8236 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237{
8238 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008239
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240 if (index != 0) {
8241 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008242 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 return -1;
8244 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008245 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008247 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008248 *ptr = (void *) PyString_AS_STRING(str);
8249 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008250}
8251
8252/* Helpers for PyUnicode_Format() */
8253
8254static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008255getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008257 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008259 (*p_argidx)++;
8260 if (arglen < 0)
8261 return args;
8262 else
8263 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 }
8265 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008266 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 return NULL;
8268}
8269
/* Bit flags describing a format conversion.  F_ALT is what formatfloat()
   and formatint() test for the alternate ("#") form; the other names
   presumably correspond to the '-', '+', ' ' and '0' conversion flags --
   confirm against the parser in PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275
Martin v. Löwis18e16552006-02-15 17:27:45 +00008276static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008277strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008279 register Py_ssize_t i;
8280 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008282 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 return len;
8285}
8286
Neal Norwitzfc76d632006-01-10 06:03:13 +00008287static int
Eric Smith068f0652009-04-25 21:40:15 +00008288doubletounicode(Py_UNICODE *buffer, size_t len, int format_code,
8289 int precision, int flags, double x)
Neal Norwitzfc76d632006-01-10 06:03:13 +00008290{
Tim Peters15231542006-02-16 01:08:01 +00008291 Py_ssize_t result;
8292
Eric Smith068f0652009-04-25 21:40:15 +00008293 _PyOS_double_to_string((char *)buffer, len, x, format_code, precision,
8294 flags, NULL);
Tim Peters15231542006-02-16 01:08:01 +00008295 result = strtounicode(buffer, (char *)buffer);
8296 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008297}
8298
8299static int
8300longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8301{
Tim Peters15231542006-02-16 01:08:01 +00008302 Py_ssize_t result;
8303
Neal Norwitzfc76d632006-01-10 06:03:13 +00008304 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008305 result = strtounicode(buffer, (char *)buffer);
8306 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008307}
8308
Guido van Rossum078151d2002-08-11 04:24:12 +00008309/* XXX To save some code duplication, formatfloat/long/int could have been
8310 shared with stringobject.c, converting from 8-bit to Unicode after the
8311 formatting is done. */
8312
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313static int
8314formatfloat(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008315 size_t buflen,
8316 int flags,
8317 int prec,
8318 int type,
8319 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008322
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 x = PyFloat_AsDouble(v);
8324 if (x == -1.0 && PyErr_Occurred())
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008325 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008327 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008328 /* make sure that the decimal representation of precision really does
8329 need at most 10 digits: platforms with sizeof(int) == 8 exist! */
8330 if (prec > 0x7fffffffL) {
8331 PyErr_SetString(PyExc_OverflowError,
8332 "outrageously large precision "
8333 "for formatted float");
8334 return -1;
8335 }
8336
Mark Dickinson2e648ec2009-03-29 14:37:51 +00008337 if (type == 'f' && fabs(x) >= 1e50)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008338 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008339 /* Worst case length calc to ensure no buffer overrun:
8340
8341 'g' formats:
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008342 fmt = %#.<prec>g
8343 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8344 for any double rep.)
8345 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008346
8347 'f' formats:
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008348 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8349 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008350
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008351 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008352 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008353
8354 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008355 if (((type == 'g' || type == 'G') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008356 buflen <= (size_t)10 + (size_t)prec) ||
8357 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8358 PyErr_SetString(PyExc_OverflowError,
8359 "formatted float is too long (precision too large?)");
8360 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008361 }
Eric Smith068f0652009-04-25 21:40:15 +00008362 return doubletounicode(buf, buflen, type, prec,
8363 (flags&F_ALT)?Py_DTSF_ALT:0, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364}
8365
Tim Peters38fd5b62000-09-21 05:43:11 +00008366static PyObject*
8367formatlong(PyObject *val, int flags, int prec, int type)
8368{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008369 char *buf;
8370 int i, len;
8371 PyObject *str; /* temporary string object. */
8372 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008373
Benjamin Peterson857ce152009-01-31 16:29:18 +00008374 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8375 if (!str)
8376 return NULL;
8377 result = _PyUnicode_New(len);
8378 if (!result) {
8379 Py_DECREF(str);
8380 return NULL;
8381 }
8382 for (i = 0; i < len; i++)
8383 result->str[i] = buf[i];
8384 result->str[len] = 0;
8385 Py_DECREF(str);
8386 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008387}
8388
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389static int
8390formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008391 size_t buflen,
8392 int flags,
8393 int prec,
8394 int type,
8395 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008397 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008398 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8399 * + 1 + 1
8400 * = 24
8401 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008402 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008403 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 long x;
8405
8406 x = PyInt_AsLong(v);
8407 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008408 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008409 if (x < 0 && type == 'u') {
8410 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008411 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008412 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8413 sign = "-";
8414 else
8415 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008417 prec = 1;
8418
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008419 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8420 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008421 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008422 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008423 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008424 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008425 return -1;
8426 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008427
8428 if ((flags & F_ALT) &&
8429 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008430 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008431 * of issues that cause pain:
8432 * - when 0 is being converted, the C standard leaves off
8433 * the '0x' or '0X', which is inconsistent with other
8434 * %#x/%#X conversions and inconsistent with Python's
8435 * hex() function
8436 * - there are platforms that violate the standard and
8437 * convert 0 with the '0x' or '0X'
8438 * (Metrowerks, Compaq Tru64)
8439 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008440 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008441 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008442 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008443 * We can achieve the desired consistency by inserting our
8444 * own '0x' or '0X' prefix, and substituting %x/%X in place
8445 * of %#x/%#X.
8446 *
8447 * Note that this is the same approach as used in
8448 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008449 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008450 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8451 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008452 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008453 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008454 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8455 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008456 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008457 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008458 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008459 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008460 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008461 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008462}
8463
8464static int
8465formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008466 size_t buflen,
8467 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008468{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008469 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008470 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008471 if (PyUnicode_GET_SIZE(v) != 1)
8472 goto onError;
8473 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008476 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008477 if (PyString_GET_SIZE(v) != 1)
8478 goto onError;
8479 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481
8482 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008483 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008484 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008485 x = PyInt_AsLong(v);
8486 if (x == -1 && PyErr_Occurred())
8487 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008488#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008489 if (x < 0 || x > 0x10ffff) {
8490 PyErr_SetString(PyExc_OverflowError,
8491 "%c arg not in range(0x110000) "
8492 "(wide Python build)");
8493 return -1;
8494 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008495#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008496 if (x < 0 || x > 0xffff) {
8497 PyErr_SetString(PyExc_OverflowError,
8498 "%c arg not in range(0x10000) "
8499 "(narrow Python build)");
8500 return -1;
8501 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008502#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008503 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008504 }
8505 buf[1] = '\0';
8506 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008507
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008508 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008509 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008510 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008511 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512}
8513
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so the cast cannot bind to
   neighbouring tokens at the point of use (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8523
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008525 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008526{
8527 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008528 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529 int args_owned = 0;
8530 PyUnicodeObject *result = NULL;
8531 PyObject *dict = NULL;
8532 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008533
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008535 PyErr_BadInternalCall();
8536 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537 }
8538 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008539 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008540 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541 fmt = PyUnicode_AS_UNICODE(uformat);
8542 fmtcnt = PyUnicode_GET_SIZE(uformat);
8543
8544 reslen = rescnt = fmtcnt + 100;
8545 result = _PyUnicode_New(reslen);
8546 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008547 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 res = PyUnicode_AS_UNICODE(result);
8549
8550 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008551 arglen = PyTuple_Size(args);
8552 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553 }
8554 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008555 arglen = -1;
8556 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008557 }
Christian Heimese93237d2007-12-19 02:37:44 +00008558 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008559 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008560 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561
8562 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008563 if (*fmt != '%') {
8564 if (--rescnt < 0) {
8565 rescnt = fmtcnt + 100;
8566 reslen += rescnt;
8567 if (_PyUnicode_Resize(&result, reslen) < 0)
8568 goto onError;
8569 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8570 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008571 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008572 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008573 }
8574 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008575 /* Got a format specifier */
8576 int flags = 0;
8577 Py_ssize_t width = -1;
8578 int prec = -1;
8579 Py_UNICODE c = '\0';
8580 Py_UNICODE fill;
8581 int isnumok;
8582 PyObject *v = NULL;
8583 PyObject *temp = NULL;
8584 Py_UNICODE *pbuf;
8585 Py_UNICODE sign;
8586 Py_ssize_t len;
8587 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8588
8589 fmt++;
8590 if (*fmt == '(') {
8591 Py_UNICODE *keystart;
8592 Py_ssize_t keylen;
8593 PyObject *key;
8594 int pcount = 1;
8595
8596 if (dict == NULL) {
8597 PyErr_SetString(PyExc_TypeError,
8598 "format requires a mapping");
8599 goto onError;
8600 }
8601 ++fmt;
8602 --fmtcnt;
8603 keystart = fmt;
8604 /* Skip over balanced parentheses */
8605 while (pcount > 0 && --fmtcnt >= 0) {
8606 if (*fmt == ')')
8607 --pcount;
8608 else if (*fmt == '(')
8609 ++pcount;
8610 fmt++;
8611 }
8612 keylen = fmt - keystart - 1;
8613 if (fmtcnt < 0 || pcount > 0) {
8614 PyErr_SetString(PyExc_ValueError,
8615 "incomplete format key");
8616 goto onError;
8617 }
8618#if 0
8619 /* keys are converted to strings using UTF-8 and
8620 then looked up since Python uses strings to hold
8621 variables names etc. in its namespaces and we
8622 wouldn't want to break common idioms. */
8623 key = PyUnicode_EncodeUTF8(keystart,
8624 keylen,
8625 NULL);
8626#else
8627 key = PyUnicode_FromUnicode(keystart, keylen);
8628#endif
8629 if (key == NULL)
8630 goto onError;
8631 if (args_owned) {
8632 Py_DECREF(args);
8633 args_owned = 0;
8634 }
8635 args = PyObject_GetItem(dict, key);
8636 Py_DECREF(key);
8637 if (args == NULL) {
8638 goto onError;
8639 }
8640 args_owned = 1;
8641 arglen = -1;
8642 argidx = -2;
8643 }
8644 while (--fmtcnt >= 0) {
8645 switch (c = *fmt++) {
8646 case '-': flags |= F_LJUST; continue;
8647 case '+': flags |= F_SIGN; continue;
8648 case ' ': flags |= F_BLANK; continue;
8649 case '#': flags |= F_ALT; continue;
8650 case '0': flags |= F_ZERO; continue;
8651 }
8652 break;
8653 }
8654 if (c == '*') {
8655 v = getnextarg(args, arglen, &argidx);
8656 if (v == NULL)
8657 goto onError;
8658 if (!PyInt_Check(v)) {
8659 PyErr_SetString(PyExc_TypeError,
8660 "* wants int");
8661 goto onError;
8662 }
8663 width = PyInt_AsLong(v);
8664 if (width < 0) {
8665 flags |= F_LJUST;
8666 width = -width;
8667 }
8668 if (--fmtcnt >= 0)
8669 c = *fmt++;
8670 }
8671 else if (c >= '0' && c <= '9') {
8672 width = c - '0';
8673 while (--fmtcnt >= 0) {
8674 c = *fmt++;
8675 if (c < '0' || c > '9')
8676 break;
8677 if ((width*10) / 10 != width) {
8678 PyErr_SetString(PyExc_ValueError,
8679 "width too big");
8680 goto onError;
8681 }
8682 width = width*10 + (c - '0');
8683 }
8684 }
8685 if (c == '.') {
8686 prec = 0;
8687 if (--fmtcnt >= 0)
8688 c = *fmt++;
8689 if (c == '*') {
8690 v = getnextarg(args, arglen, &argidx);
8691 if (v == NULL)
8692 goto onError;
8693 if (!PyInt_Check(v)) {
8694 PyErr_SetString(PyExc_TypeError,
8695 "* wants int");
8696 goto onError;
8697 }
8698 prec = PyInt_AsLong(v);
8699 if (prec < 0)
8700 prec = 0;
8701 if (--fmtcnt >= 0)
8702 c = *fmt++;
8703 }
8704 else if (c >= '0' && c <= '9') {
8705 prec = c - '0';
8706 while (--fmtcnt >= 0) {
8707 c = Py_CHARMASK(*fmt++);
8708 if (c < '0' || c > '9')
8709 break;
8710 if ((prec*10) / 10 != prec) {
8711 PyErr_SetString(PyExc_ValueError,
8712 "prec too big");
8713 goto onError;
8714 }
8715 prec = prec*10 + (c - '0');
8716 }
8717 }
8718 } /* prec */
8719 if (fmtcnt >= 0) {
8720 if (c == 'h' || c == 'l' || c == 'L') {
8721 if (--fmtcnt >= 0)
8722 c = *fmt++;
8723 }
8724 }
8725 if (fmtcnt < 0) {
8726 PyErr_SetString(PyExc_ValueError,
8727 "incomplete format");
8728 goto onError;
8729 }
8730 if (c != '%') {
8731 v = getnextarg(args, arglen, &argidx);
8732 if (v == NULL)
8733 goto onError;
8734 }
8735 sign = 0;
8736 fill = ' ';
8737 switch (c) {
8738
8739 case '%':
8740 pbuf = formatbuf;
8741 /* presume that buffer length is at least 1 */
8742 pbuf[0] = '%';
8743 len = 1;
8744 break;
8745
8746 case 's':
8747 case 'r':
8748 if (PyUnicode_Check(v) && c == 's') {
8749 temp = v;
8750 Py_INCREF(temp);
8751 }
8752 else {
8753 PyObject *unicode;
8754 if (c == 's')
8755 temp = PyObject_Unicode(v);
8756 else
8757 temp = PyObject_Repr(v);
8758 if (temp == NULL)
8759 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008760 if (PyUnicode_Check(temp))
8761 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008762 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008763 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008764 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8765 PyString_GET_SIZE(temp),
8766 NULL,
8767 "strict");
8768 Py_DECREF(temp);
8769 temp = unicode;
8770 if (temp == NULL)
8771 goto onError;
8772 }
8773 else {
8774 Py_DECREF(temp);
8775 PyErr_SetString(PyExc_TypeError,
8776 "%s argument has non-string str()");
8777 goto onError;
8778 }
8779 }
8780 pbuf = PyUnicode_AS_UNICODE(temp);
8781 len = PyUnicode_GET_SIZE(temp);
8782 if (prec >= 0 && len > prec)
8783 len = prec;
8784 break;
8785
8786 case 'i':
8787 case 'd':
8788 case 'u':
8789 case 'o':
8790 case 'x':
8791 case 'X':
8792 if (c == 'i')
8793 c = 'd';
8794 isnumok = 0;
8795 if (PyNumber_Check(v)) {
8796 PyObject *iobj=NULL;
8797
8798 if (PyInt_Check(v) || (PyLong_Check(v))) {
8799 iobj = v;
8800 Py_INCREF(iobj);
8801 }
8802 else {
8803 iobj = PyNumber_Int(v);
8804 if (iobj==NULL) iobj = PyNumber_Long(v);
8805 }
8806 if (iobj!=NULL) {
8807 if (PyInt_Check(iobj)) {
8808 isnumok = 1;
8809 pbuf = formatbuf;
8810 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8811 flags, prec, c, iobj);
8812 Py_DECREF(iobj);
8813 if (len < 0)
8814 goto onError;
8815 sign = 1;
8816 }
8817 else if (PyLong_Check(iobj)) {
8818 isnumok = 1;
8819 temp = formatlong(iobj, flags, prec, c);
8820 Py_DECREF(iobj);
8821 if (!temp)
8822 goto onError;
8823 pbuf = PyUnicode_AS_UNICODE(temp);
8824 len = PyUnicode_GET_SIZE(temp);
8825 sign = 1;
8826 }
8827 else {
8828 Py_DECREF(iobj);
8829 }
8830 }
8831 }
8832 if (!isnumok) {
8833 PyErr_Format(PyExc_TypeError,
8834 "%%%c format: a number is required, "
8835 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8836 goto onError;
8837 }
8838 if (flags & F_ZERO)
8839 fill = '0';
8840 break;
8841
8842 case 'e':
8843 case 'E':
8844 case 'f':
8845 case 'F':
8846 case 'g':
8847 case 'G':
8848 if (c == 'F')
8849 c = 'f';
8850 pbuf = formatbuf;
8851 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8852 flags, prec, c, v);
8853 if (len < 0)
8854 goto onError;
8855 sign = 1;
8856 if (flags & F_ZERO)
8857 fill = '0';
8858 break;
8859
8860 case 'c':
8861 pbuf = formatbuf;
8862 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8863 if (len < 0)
8864 goto onError;
8865 break;
8866
8867 default:
8868 PyErr_Format(PyExc_ValueError,
8869 "unsupported format character '%c' (0x%x) "
8870 "at index %zd",
8871 (31<=c && c<=126) ? (char)c : '?',
8872 (int)c,
8873 (Py_ssize_t)(fmt - 1 -
8874 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008875 goto onError;
8876 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008877 if (sign) {
8878 if (*pbuf == '-' || *pbuf == '+') {
8879 sign = *pbuf++;
8880 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008881 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008882 else if (flags & F_SIGN)
8883 sign = '+';
8884 else if (flags & F_BLANK)
8885 sign = ' ';
8886 else
8887 sign = 0;
8888 }
8889 if (width < len)
8890 width = len;
8891 if (rescnt - (sign != 0) < width) {
8892 reslen -= rescnt;
8893 rescnt = width + fmtcnt + 100;
8894 reslen += rescnt;
8895 if (reslen < 0) {
8896 Py_XDECREF(temp);
8897 PyErr_NoMemory();
8898 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008899 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008900 if (_PyUnicode_Resize(&result, reslen) < 0) {
8901 Py_XDECREF(temp);
8902 goto onError;
8903 }
8904 res = PyUnicode_AS_UNICODE(result)
8905 + reslen - rescnt;
8906 }
8907 if (sign) {
8908 if (fill != ' ')
8909 *res++ = sign;
8910 rescnt--;
8911 if (width > len)
8912 width--;
8913 }
8914 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8915 assert(pbuf[0] == '0');
8916 assert(pbuf[1] == c);
8917 if (fill != ' ') {
8918 *res++ = *pbuf++;
8919 *res++ = *pbuf++;
8920 }
8921 rescnt -= 2;
8922 width -= 2;
8923 if (width < 0)
8924 width = 0;
8925 len -= 2;
8926 }
8927 if (width > len && !(flags & F_LJUST)) {
8928 do {
8929 --rescnt;
8930 *res++ = fill;
8931 } while (--width > len);
8932 }
8933 if (fill == ' ') {
8934 if (sign)
8935 *res++ = sign;
8936 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8937 assert(pbuf[0] == '0');
8938 assert(pbuf[1] == c);
8939 *res++ = *pbuf++;
8940 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008941 }
8942 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008943 Py_UNICODE_COPY(res, pbuf, len);
8944 res += len;
8945 rescnt -= len;
8946 while (--width >= len) {
8947 --rescnt;
8948 *res++ = ' ';
8949 }
8950 if (dict && (argidx < arglen) && c != '%') {
8951 PyErr_SetString(PyExc_TypeError,
8952 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008953 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008954 goto onError;
8955 }
8956 Py_XDECREF(temp);
8957 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 } /* until end */
8959 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008960 PyErr_SetString(PyExc_TypeError,
8961 "not all arguments converted during string formatting");
8962 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 }
8964
Thomas Woutersa96affe2006-03-12 00:29:36 +00008965 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008966 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008967 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008968 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 }
8970 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 return (PyObject *)result;
8972
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008973 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 Py_XDECREF(result);
8975 Py_DECREF(uformat);
8976 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008977 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 }
8979 return NULL;
8980}
8981
8982static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008983 (readbufferproc) unicode_buffer_getreadbuf,
8984 (writebufferproc) unicode_buffer_getwritebuf,
8985 (segcountproc) unicode_buffer_getsegcount,
8986 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987};
8988
Jeremy Hylton938ace62002-07-17 16:30:39 +00008989static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008990unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8991
Tim Peters6d6c1a32001-08-02 04:15:00 +00008992static PyObject *
8993unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8994{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008995 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008996 static char *kwlist[] = {"string", "encoding", "errors", 0};
8997 char *encoding = NULL;
8998 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008999
Benjamin Peterson857ce152009-01-31 16:29:18 +00009000 if (type != &PyUnicode_Type)
9001 return unicode_subtype_new(type, args, kwds);
9002 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009003 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00009004 return NULL;
9005 if (x == NULL)
9006 return (PyObject *)_PyUnicode_New(0);
9007 if (encoding == NULL && errors == NULL)
9008 return PyObject_Unicode(x);
9009 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009010 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009011}
9012
Guido van Rossume023fe02001-08-30 03:12:59 +00009013static PyObject *
9014unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9015{
Benjamin Peterson857ce152009-01-31 16:29:18 +00009016 PyUnicodeObject *tmp, *pnew;
9017 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009018
Benjamin Peterson857ce152009-01-31 16:29:18 +00009019 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9020 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9021 if (tmp == NULL)
9022 return NULL;
9023 assert(PyUnicode_Check(tmp));
9024 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9025 if (pnew == NULL) {
9026 Py_DECREF(tmp);
9027 return NULL;
9028 }
9029 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9030 if (pnew->str == NULL) {
9031 _Py_ForgetReference((PyObject *)pnew);
9032 PyObject_Del(pnew);
9033 Py_DECREF(tmp);
9034 return PyErr_NoMemory();
9035 }
9036 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9037 pnew->length = n;
9038 pnew->hash = tmp->hash;
9039 Py_DECREF(tmp);
9040 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009041}
9042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009043PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009044 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009045\n\
9046Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009047encoding defaults to the current default string encoding.\n\
9048errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009049
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00009051 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00009052 "unicode", /* tp_name */
9053 sizeof(PyUnicodeObject), /* tp_size */
9054 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00009056 (destructor)unicode_dealloc, /* tp_dealloc */
9057 0, /* tp_print */
9058 0, /* tp_getattr */
9059 0, /* tp_setattr */
9060 0, /* tp_compare */
9061 unicode_repr, /* tp_repr */
9062 &unicode_as_number, /* tp_as_number */
9063 &unicode_as_sequence, /* tp_as_sequence */
9064 &unicode_as_mapping, /* tp_as_mapping */
9065 (hashfunc) unicode_hash, /* tp_hash*/
9066 0, /* tp_call*/
9067 (reprfunc) unicode_str, /* tp_str */
9068 PyObject_GenericGetAttr, /* tp_getattro */
9069 0, /* tp_setattro */
9070 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009071 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009072 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00009073 unicode_doc, /* tp_doc */
9074 0, /* tp_traverse */
9075 0, /* tp_clear */
9076 PyUnicode_RichCompare, /* tp_richcompare */
9077 0, /* tp_weaklistoffset */
9078 0, /* tp_iter */
9079 0, /* tp_iternext */
9080 unicode_methods, /* tp_methods */
9081 0, /* tp_members */
9082 0, /* tp_getset */
9083 &PyBaseString_Type, /* tp_base */
9084 0, /* tp_dict */
9085 0, /* tp_descr_get */
9086 0, /* tp_descr_set */
9087 0, /* tp_dictoffset */
9088 0, /* tp_init */
9089 0, /* tp_alloc */
9090 unicode_new, /* tp_new */
9091 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092};
9093
9094/* Initialize the Unicode implementation */
9095
Thomas Wouters78890102000-07-22 19:25:51 +00009096void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009098 int i;
9099
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009100 /* XXX - move this array to unicodectype.c ? */
9101 Py_UNICODE linebreak[] = {
9102 0x000A, /* LINE FEED */
9103 0x000D, /* CARRIAGE RETURN */
9104 0x001C, /* FILE SEPARATOR */
9105 0x001D, /* GROUP SEPARATOR */
9106 0x001E, /* RECORD SEPARATOR */
9107 0x0085, /* NEXT LINE */
9108 0x2028, /* LINE SEPARATOR */
9109 0x2029, /* PARAGRAPH SEPARATOR */
9110 };
9111
Fred Drakee4315f52000-05-09 19:53:39 +00009112 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009113 free_list = NULL;
9114 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009116 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009117 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00009118
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009119 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009120 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009121 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009122 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009123 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009124
9125 /* initialize the linebreak bloom filter */
9126 bloom_linebreak = make_bloom_mask(
9127 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9128 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009129
9130 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131}
9132
9133/* Finalize the Unicode implementation */
9134
Christian Heimes3b718a72008-02-14 12:47:33 +00009135int
9136PyUnicode_ClearFreeList(void)
9137{
9138 int freelist_size = numfree;
9139 PyUnicodeObject *u;
9140
9141 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009142 PyUnicodeObject *v = u;
9143 u = *(PyUnicodeObject **)u;
9144 if (v->str)
9145 PyObject_DEL(v->str);
9146 Py_XDECREF(v->defenc);
9147 PyObject_Del(v);
9148 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00009149 }
9150 free_list = NULL;
9151 assert(numfree == 0);
9152 return freelist_size;
9153}
9154
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155void
Thomas Wouters78890102000-07-22 19:25:51 +00009156_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009158 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009160 Py_XDECREF(unicode_empty);
9161 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009162
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009163 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009164 if (unicode_latin1[i]) {
9165 Py_DECREF(unicode_latin1[i]);
9166 unicode_latin1[i] = NULL;
9167 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009168 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009169 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009171
Anthony Baxterac6bd462006-04-13 02:06:09 +00009172#ifdef __cplusplus
9173}
9174#endif
9175
9176
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009177/*
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009178 Local variables:
9179 c-basic-offset: 4
9180 indent-tabs-mode: nil
9181 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009182*/