/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Limit for the Unicode object free list: unicode_dealloc() stops
   parking objects on the list once it holds this many entries. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length (counted in Py_UNICODE
   units) less than this limit. This reduces malloc() overhead for
   small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1
   when the character counts as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: HORIZONTAL TABULATION
       0x000A: LINE FEED
       0x000B: VERTICAL TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1
   when the character is treated as a line break and 0 otherwise.  The
   table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};


Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* Non-zero when the bit selected by the low 5 bits of ch is set in
   mask.  The shifted constant is 1UL, not 1: shifting a signed int
   into bit 31 is undefined behaviour and, where it "works", the
   sign-extended result spills into the upper bits of the mask. */
#define BLOOM(mask, ch) (((mask) & (1UL << ((ch) & 0x1F))))

/* True when ch is a line break.  ASCII is answered straight from the
   ascii_linebreak table; everything else goes through the bloom
   pre-filter before the full Py_UNICODE_ISLINEBREAK() test. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))

Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000203Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000204{
205 /* calculate simple bloom-style bitmask for a given unicode string */
206
207 long mask;
208 Py_ssize_t i;
209
210 mask = 0;
211 for (i = 0; i < len; i++)
212 mask |= (1 << (ptr[i] & 0x1F));
213
214 return mask;
215}
216
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000217Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000218{
219 Py_ssize_t i;
220
221 for (i = 0; i < setlen; i++)
222 if (set[i] == chr)
223 return 1;
224
Fredrik Lundh77633512006-05-23 19:47:35 +0000225 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226}
227
/* True when chr is a member of set: the bloom mask rejects most
   non-members cheaply, unicode_member() confirms the rest.  The whole
   expansion is parenthesised so the macro can be negated or combined
   with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))

Guido van Rossumd57fd912000-03-10 22:53:23 +0000231/* --- Unicode Object ----------------------------------------------------- */
232
233static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000234int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000235 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000236{
237 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000238
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000240 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000241 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000243 /* Resizing shared object (unicode_empty or single character
244 objects) in-place is not allowed. Use PyUnicode_Resize()
245 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000246
Benjamin Peterson857ce152009-01-31 16:29:18 +0000247 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 (unicode->length == 1 &&
249 unicode->str[0] < 256U &&
250 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 return -1;
254 }
255
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000256 /* We allocate one more byte to make sure the string is Ux0000 terminated.
257 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000258 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259 it contains). */
260
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000262 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000263 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000265 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 PyErr_NoMemory();
267 return -1;
268 }
269 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000270 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000271
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000272 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000274 if (unicode->defenc) {
275 Py_DECREF(unicode->defenc);
276 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000279
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return 0;
281}
282
283/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000284 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000285
286 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000287 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288
289*/
290
291static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000292PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293{
294 register PyUnicodeObject *unicode;
295
Andrew Dalkee0df7622006-05-27 11:04:36 +0000296 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297 if (length == 0 && unicode_empty != NULL) {
298 Py_INCREF(unicode_empty);
299 return unicode_empty;
300 }
301
Neal Norwitze7d8be82008-07-31 17:17:14 +0000302 /* Ensure we won't overflow the size. */
303 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
304 return (PyUnicodeObject *)PyErr_NoMemory();
305 }
306
Guido van Rossumd57fd912000-03-10 22:53:23 +0000307 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000308 if (free_list) {
309 unicode = free_list;
310 free_list = *(PyUnicodeObject **)unicode;
311 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000312 if (unicode->str) {
313 /* Keep-Alive optimization: we only upsize the buffer,
314 never downsize it. */
315 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000316 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000317 PyObject_DEL(unicode->str);
318 unicode->str = NULL;
319 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000320 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000321 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000322 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
323 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000324 }
325 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 }
327 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000328 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000329 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 if (unicode == NULL)
331 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000332 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
333 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 }
335
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000336 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000337 PyErr_NoMemory();
338 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000339 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000340 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000341 * the caller fails before initializing str -- unicode_resize()
342 * reads str[0], and the Keep-Alive optimization can keep memory
343 * allocated for str alive across a call to unicode_dealloc(unicode).
344 * We don't want unicode_resize to read uninitialized memory in
345 * that case.
346 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000347 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000348 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000349 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000350 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000351 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000352 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000353
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000354 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000355 /* XXX UNREF/NEWREF interface should be more symmetrical */
356 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000357 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000358 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360}
361
362static
Guido van Rossum9475a232001-10-05 20:51:39 +0000363void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000365 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000366 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000367 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000368 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
369 PyObject_DEL(unicode->str);
370 unicode->str = NULL;
371 unicode->length = 0;
372 }
373 if (unicode->defenc) {
374 Py_DECREF(unicode->defenc);
375 unicode->defenc = NULL;
376 }
377 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000378 *(PyUnicodeObject **)unicode = free_list;
379 free_list = unicode;
380 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 }
382 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000383 PyObject_DEL(unicode->str);
384 Py_XDECREF(unicode->defenc);
385 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386 }
387}
388
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000389static
390int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000391{
392 register PyUnicodeObject *v;
393
394 /* Argument checks */
395 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000396 PyErr_BadInternalCall();
397 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000398 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000399 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000400 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000401 PyErr_BadInternalCall();
402 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000403 }
404
405 /* Resizing unicode_empty and single character objects is not
406 possible since these are being shared. We simply return a fresh
407 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000408 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000409 (v == unicode_empty || v->length == 1)) {
410 PyUnicodeObject *w = _PyUnicode_New(length);
411 if (w == NULL)
412 return -1;
413 Py_UNICODE_COPY(w->str, v->str,
414 length < v->length ? length : v->length);
415 Py_DECREF(*unicode);
416 *unicode = w;
417 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000418 }
419
420 /* Note that we don't have to modify *unicode for unshared Unicode
421 objects, since we can modify them in-place. */
422 return unicode_resize(v, length);
423}
424
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000425int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
426{
427 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
428}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429
Guido van Rossumd57fd912000-03-10 22:53:23 +0000430PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000431 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432{
433 PyUnicodeObject *unicode;
434
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000435 /* If the Unicode data is known at construction time, we can apply
436 some optimizations which share commonly used objects. */
437 if (u != NULL) {
438
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000439 /* Optimization for empty strings */
440 if (size == 0 && unicode_empty != NULL) {
441 Py_INCREF(unicode_empty);
442 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000443 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000444
445 /* Single character Unicode objects in the Latin-1 range are
446 shared when using this constructor */
447 if (size == 1 && *u < 256) {
448 unicode = unicode_latin1[*u];
449 if (!unicode) {
450 unicode = _PyUnicode_New(1);
451 if (!unicode)
452 return NULL;
453 unicode->str[0] = *u;
454 unicode_latin1[*u] = unicode;
455 }
456 Py_INCREF(unicode);
457 return (PyObject *)unicode;
458 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000459 }
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 unicode = _PyUnicode_New(size);
462 if (!unicode)
463 return NULL;
464
465 /* Copy the Unicode data into the new object */
466 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000467 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000468
469 return (PyObject *)unicode;
470}
471
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000472PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
473{
474 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000475
Benjamin Peterson857ce152009-01-31 16:29:18 +0000476 if (size < 0) {
477 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000478 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000479 return NULL;
480 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000481
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects.
484 Also, this means the input must be UTF-8, so fall back to the
485 UTF-8 decoder at the end. */
486 if (u != NULL) {
487
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000488 /* Optimization for empty strings */
489 if (size == 0 && unicode_empty != NULL) {
490 Py_INCREF(unicode_empty);
491 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000492 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000493
494 /* Single characters are shared when using this constructor.
495 Restrict to ASCII, since the input must be UTF-8. */
496 if (size == 1 && Py_CHARMASK(*u) < 128) {
497 unicode = unicode_latin1[Py_CHARMASK(*u)];
498 if (!unicode) {
499 unicode = _PyUnicode_New(1);
500 if (!unicode)
501 return NULL;
502 unicode->str[0] = Py_CHARMASK(*u);
503 unicode_latin1[Py_CHARMASK(*u)] = unicode;
504 }
505 Py_INCREF(unicode);
506 return (PyObject *)unicode;
507 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000508
509 return PyUnicode_DecodeUTF8(u, size, NULL);
510 }
511
512 unicode = _PyUnicode_New(size);
513 if (!unicode)
514 return NULL;
515
516 return (PyObject *)unicode;
517}
518
519PyObject *PyUnicode_FromString(const char *u)
520{
521 size_t size = strlen(u);
522 if (size > PY_SSIZE_T_MAX) {
523 PyErr_SetString(PyExc_OverflowError, "input too long");
524 return NULL;
525 }
526
527 return PyUnicode_FromStringAndSize(u, size);
528}
529
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530#ifdef HAVE_WCHAR_H
531
532PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000533 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534{
535 PyUnicodeObject *unicode;
536
537 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000538 PyErr_BadInternalCall();
539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000540 }
541
542 unicode = _PyUnicode_New(size);
543 if (!unicode)
544 return NULL;
545
546 /* Copy the wchar_t data into the new object */
547#ifdef HAVE_USABLE_WCHAR_T
548 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000549#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000551 register Py_UNICODE *u;
552 register Py_ssize_t i;
553 u = PyUnicode_AS_UNICODE(unicode);
554 for (i = size; i > 0; i--)
555 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000556 }
557#endif
558
559 return (PyObject *)unicode;
560}
561
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000562static void
563makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
564{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000565 *fmt++ = '%';
566 if (width) {
567 if (zeropad)
568 *fmt++ = '0';
569 fmt += sprintf(fmt, "%d", width);
570 }
571 if (precision)
572 fmt += sprintf(fmt, ".%d", precision);
573 if (longflag)
574 *fmt++ = 'l';
575 else if (size_tflag) {
576 char *f = PY_FORMAT_SIZE_T;
577 while (*f)
578 *fmt++ = *f++;
579 }
580 *fmt++ = c;
581 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000582}
583
/* Append the NUL-terminated char string to the output, one element per
   char.  Relies on two variables in the caller's scope: `copy` (a
   const char * scratch cursor) and `s` (the output cursor, advanced
   past the appended characters).  The terminating NUL is not copied. */
#define appendstring(string) {for (copy = (string);*copy;) *s++ = *copy++;}

586PyObject *
587PyUnicode_FromFormatV(const char *format, va_list vargs)
588{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000589 va_list count;
590 Py_ssize_t callcount = 0;
591 PyObject **callresults = NULL;
592 PyObject **callresult = NULL;
593 Py_ssize_t n = 0;
594 int width = 0;
595 int precision = 0;
596 int zeropad;
597 const char* f;
598 Py_UNICODE *s;
599 PyObject *string;
600 /* used by sprintf */
601 char buffer[21];
602 /* use abuffer instead of buffer, if we need more space
603 * (which can happen if there's a format specifier with width). */
604 char *abuffer = NULL;
605 char *realbuffer;
606 Py_ssize_t abuffersize = 0;
607 char fmt[60]; /* should be enough for %0width.precisionld */
608 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000609
610#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000611 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000612#else
613#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000614 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000615#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000616 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000617#endif
618#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000619 /* step 1: count the number of %S/%R format specifications
620 * (we call PyObject_Str()/PyObject_Repr() for these objects
621 * once during step 3 and put the result in an array) */
622 for (f = format; *f; f++) {
623 if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
624 ++callcount;
625 }
626 /* step 2: allocate memory for the results of
627 * PyObject_Str()/PyObject_Repr() calls */
628 if (callcount) {
629 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
630 if (!callresults) {
631 PyErr_NoMemory();
632 return NULL;
633 }
634 callresult = callresults;
635 }
636 /* step 3: figure out how large a buffer we need */
637 for (f = format; *f; f++) {
638 if (*f == '%') {
639 const char* p = f;
640 width = 0;
641 while (isdigit((unsigned)*f))
642 width = (width*10) + *f++ - '0';
643 while (*++f && *f != '%' && !isalpha((unsigned)*f))
644 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000645
Benjamin Peterson857ce152009-01-31 16:29:18 +0000646 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
647 * they don't affect the amount of space we reserve.
648 */
649 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000650 (f[1] == 'd' || f[1] == 'u'))
651 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000652
Benjamin Peterson857ce152009-01-31 16:29:18 +0000653 switch (*f) {
654 case 'c':
655 (void)va_arg(count, int);
656 /* fall through... */
657 case '%':
658 n++;
659 break;
660 case 'd': case 'u': case 'i': case 'x':
661 (void) va_arg(count, int);
662 /* 20 bytes is enough to hold a 64-bit
663 integer. Decimal takes the most space.
664 This isn't enough for octal.
665 If a width is specified we need more
666 (which we allocate later). */
667 if (width < 20)
668 width = 20;
669 n += width;
670 if (abuffersize < width)
671 abuffersize = width;
672 break;
673 case 's':
674 {
675 /* UTF-8 */
676 unsigned char*s;
677 s = va_arg(count, unsigned char*);
678 while (*s) {
679 if (*s < 128) {
680 n++; s++;
681 } else if (*s < 0xc0) {
682 /* invalid UTF-8 */
683 n++; s++;
684 } else if (*s < 0xc0) {
685 n++;
686 s++; if(!*s)break;
687 s++;
688 } else if (*s < 0xe0) {
689 n++;
690 s++; if(!*s)break;
691 s++; if(!*s)break;
692 s++;
693 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000694#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000695 n++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000696#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000697 n+=2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000698#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000699 s++; if(!*s)break;
700 s++; if(!*s)break;
701 s++; if(!*s)break;
702 s++;
703 }
704 }
705 break;
706 }
707 case 'U':
708 {
709 PyObject *obj = va_arg(count, PyObject *);
710 assert(obj && PyUnicode_Check(obj));
711 n += PyUnicode_GET_SIZE(obj);
712 break;
713 }
714 case 'V':
715 {
716 PyObject *obj = va_arg(count, PyObject *);
717 const char *str = va_arg(count, const char *);
718 assert(obj || str);
719 assert(!obj || PyUnicode_Check(obj));
720 if (obj)
721 n += PyUnicode_GET_SIZE(obj);
722 else
723 n += strlen(str);
724 break;
725 }
726 case 'S':
727 {
728 PyObject *obj = va_arg(count, PyObject *);
729 PyObject *str;
730 assert(obj);
731 str = PyObject_Str(obj);
732 if (!str)
733 goto fail;
734 n += PyUnicode_GET_SIZE(str);
735 /* Remember the str and switch to the next slot */
736 *callresult++ = str;
737 break;
738 }
739 case 'R':
740 {
741 PyObject *obj = va_arg(count, PyObject *);
742 PyObject *repr;
743 assert(obj);
744 repr = PyObject_Repr(obj);
745 if (!repr)
746 goto fail;
747 n += PyUnicode_GET_SIZE(repr);
748 /* Remember the repr and switch to the next slot */
749 *callresult++ = repr;
750 break;
751 }
752 case 'p':
753 (void) va_arg(count, int);
754 /* maximum 64-bit pointer representation:
755 * 0xffffffffffffffff
756 * so 19 characters is enough.
757 * XXX I count 18 -- what's the extra for?
758 */
759 n += 19;
760 break;
761 default:
762 /* if we stumble upon an unknown
763 formatting code, copy the rest of
764 the format string to the output
765 string. (we cannot just skip the
766 code, since there's no way to know
767 what's in the argument list) */
768 n += strlen(p);
769 goto expand;
770 }
771 } else
772 n++;
773 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000774 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000775 if (abuffersize > 20) {
776 abuffer = PyObject_Malloc(abuffersize);
777 if (!abuffer) {
778 PyErr_NoMemory();
779 goto fail;
780 }
781 realbuffer = abuffer;
782 }
783 else
784 realbuffer = buffer;
785 /* step 4: fill the buffer */
786 /* Since we've analyzed how much space we need for the worst case,
787 we don't have to resize the string.
788 There can be no errors beyond this point. */
789 string = PyUnicode_FromUnicode(NULL, n);
790 if (!string)
791 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000792
Benjamin Peterson857ce152009-01-31 16:29:18 +0000793 s = PyUnicode_AS_UNICODE(string);
794 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000795
Benjamin Peterson857ce152009-01-31 16:29:18 +0000796 for (f = format; *f; f++) {
797 if (*f == '%') {
798 const char* p = f++;
799 int longflag = 0;
800 int size_tflag = 0;
801 zeropad = (*f == '0');
802 /* parse the width.precision part */
803 width = 0;
804 while (isdigit((unsigned)*f))
805 width = (width*10) + *f++ - '0';
806 precision = 0;
807 if (*f == '.') {
808 f++;
809 while (isdigit((unsigned)*f))
810 precision = (precision*10) + *f++ - '0';
811 }
812 /* handle the long flag, but only for %ld and %lu.
813 others can be added when necessary. */
814 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
815 longflag = 1;
816 ++f;
817 }
818 /* handle the size_t flag. */
819 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
820 size_tflag = 1;
821 ++f;
822 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000823
Benjamin Peterson857ce152009-01-31 16:29:18 +0000824 switch (*f) {
825 case 'c':
826 *s++ = va_arg(vargs, int);
827 break;
828 case 'd':
829 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
830 if (longflag)
831 sprintf(realbuffer, fmt, va_arg(vargs, long));
832 else if (size_tflag)
833 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
834 else
835 sprintf(realbuffer, fmt, va_arg(vargs, int));
836 appendstring(realbuffer);
837 break;
838 case 'u':
839 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
840 if (longflag)
841 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
842 else if (size_tflag)
843 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
844 else
845 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
846 appendstring(realbuffer);
847 break;
848 case 'i':
849 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
850 sprintf(realbuffer, fmt, va_arg(vargs, int));
851 appendstring(realbuffer);
852 break;
853 case 'x':
854 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
855 sprintf(realbuffer, fmt, va_arg(vargs, int));
856 appendstring(realbuffer);
857 break;
858 case 's':
859 {
860 /* Parameter must be UTF-8 encoded.
861 In case of encoding errors, use
862 the replacement character. */
863 PyObject *u;
864 p = va_arg(vargs, char*);
865 u = PyUnicode_DecodeUTF8(p, strlen(p),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000866 "replace");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000867 if (!u)
868 goto fail;
869 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000870 PyUnicode_GET_SIZE(u));
Benjamin Peterson857ce152009-01-31 16:29:18 +0000871 s += PyUnicode_GET_SIZE(u);
872 Py_DECREF(u);
873 break;
874 }
875 case 'U':
876 {
877 PyObject *obj = va_arg(vargs, PyObject *);
878 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
879 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
880 s += size;
881 break;
882 }
883 case 'V':
884 {
885 PyObject *obj = va_arg(vargs, PyObject *);
886 const char *str = va_arg(vargs, const char *);
887 if (obj) {
888 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
889 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
890 s += size;
891 } else {
892 appendstring(str);
893 }
894 break;
895 }
896 case 'S':
897 case 'R':
898 {
899 Py_UNICODE *ucopy;
900 Py_ssize_t usize;
901 Py_ssize_t upos;
902 /* unused, since we already have the result */
903 (void) va_arg(vargs, PyObject *);
904 ucopy = PyUnicode_AS_UNICODE(*callresult);
905 usize = PyUnicode_GET_SIZE(*callresult);
906 for (upos = 0; upos<usize;)
907 *s++ = ucopy[upos++];
908 /* We're done with the unicode()/repr() => forget it */
909 Py_DECREF(*callresult);
910 /* switch to next unicode()/repr() result */
911 ++callresult;
912 break;
913 }
914 case 'p':
915 sprintf(buffer, "%p", va_arg(vargs, void*));
916 /* %p is ill-defined: ensure leading 0x. */
917 if (buffer[1] == 'X')
918 buffer[1] = 'x';
919 else if (buffer[1] != 'x') {
920 memmove(buffer+2, buffer, strlen(buffer)+1);
921 buffer[0] = '0';
922 buffer[1] = 'x';
923 }
924 appendstring(buffer);
925 break;
926 case '%':
927 *s++ = '%';
928 break;
929 default:
930 appendstring(p);
931 goto end;
932 }
933 } else
934 *s++ = *f;
935 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000936
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000937 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000938 if (callresults)
939 PyObject_Free(callresults);
940 if (abuffer)
941 PyObject_Free(abuffer);
942 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
943 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000944 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000945 if (callresults) {
946 PyObject **callresult2 = callresults;
947 while (callresult2 < callresult) {
948 Py_DECREF(*callresult2);
949 ++callresult2;
950 }
951 PyObject_Free(callresults);
952 }
953 if (abuffer)
954 PyObject_Free(abuffer);
955 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000956}
957
958#undef appendstring
959
960PyObject *
961PyUnicode_FromFormat(const char *format, ...)
962{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000963 PyObject* ret;
964 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000965
966#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +0000967 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000968#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000969 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000970#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +0000971 ret = PyUnicode_FromFormatV(format, vargs);
972 va_end(vargs);
973 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000974}
975
Martin v. Löwis18e16552006-02-15 17:27:45 +0000976Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000977 wchar_t *w,
978 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979{
980 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000981 PyErr_BadInternalCall();
982 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000983 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000984
985 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000987 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000988
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989#ifdef HAVE_USABLE_WCHAR_T
990 memcpy(w, unicode->str, size * sizeof(wchar_t));
991#else
992 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000993 register Py_UNICODE *u;
994 register Py_ssize_t i;
995 u = PyUnicode_AS_UNICODE(unicode);
996 for (i = size; i > 0; i--)
997 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000998 }
999#endif
1000
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001001 if (size > PyUnicode_GET_SIZE(unicode))
1002 return PyUnicode_GET_SIZE(unicode);
1003 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001004 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005}
1006
1007#endif
1008
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001009PyObject *PyUnicode_FromOrdinal(int ordinal)
1010{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001011 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001012
1013#ifdef Py_UNICODE_WIDE
1014 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001015 PyErr_SetString(PyExc_ValueError,
1016 "unichr() arg not in range(0x110000) "
1017 "(wide Python build)");
1018 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001019 }
1020#else
1021 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001022 PyErr_SetString(PyExc_ValueError,
1023 "unichr() arg not in range(0x10000) "
1024 "(narrow Python build)");
1025 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001026 }
1027#endif
1028
Hye-Shik Chang40574832004-04-06 07:24:51 +00001029 s[0] = (Py_UNICODE)ordinal;
1030 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001031}
1032
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033PyObject *PyUnicode_FromObject(register PyObject *obj)
1034{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001035 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001036 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001037 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001038 Py_INCREF(obj);
1039 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001040 }
1041 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001042 /* For a Unicode subtype that's not a Unicode object,
1043 return a true Unicode object with the same data. */
1044 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1045 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001046 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001047 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1048}
1049
1050PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001051 const char *encoding,
1052 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001053{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001054 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001055 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001056 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001057
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001059 PyErr_BadInternalCall();
1060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001062
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001063#if 0
1064 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001065 that no encodings is given and then redirect to
1066 PyObject_Unicode() which then applies the additional logic for
1067 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001068
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001069 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001070 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001071
1072 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001073 if (PyUnicode_Check(obj)) {
1074 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001075 PyErr_SetString(PyExc_TypeError,
1076 "decoding Unicode is not supported");
1077 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001078 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001079 return PyObject_Unicode(obj);
1080 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001081#else
1082 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001083 PyErr_SetString(PyExc_TypeError,
1084 "decoding Unicode is not supported");
1085 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001086 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001087#endif
1088
1089 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001090 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001091 s = PyString_AS_STRING(obj);
1092 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001093 }
Christian Heimes3497f942008-05-26 12:29:14 +00001094 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001095 /* Python 2.x specific */
1096 PyErr_Format(PyExc_TypeError,
1097 "decoding bytearray is not supported");
1098 return NULL;
1099 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001100 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001101 /* Overwrite the error message with something more useful in
1102 case of a TypeError. */
1103 if (PyErr_ExceptionMatches(PyExc_TypeError))
1104 PyErr_Format(PyExc_TypeError,
1105 "coercing to Unicode: need string or buffer, "
1106 "%.80s found",
1107 Py_TYPE(obj)->tp_name);
1108 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001109 }
Tim Petersced69f82003-09-16 20:30:58 +00001110
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001111 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001113 Py_INCREF(unicode_empty);
1114 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 }
Tim Petersced69f82003-09-16 20:30:58 +00001116 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001117 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001118
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001119 return v;
1120
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001121 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001122 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123}
1124
1125PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001126 Py_ssize_t size,
1127 const char *encoding,
1128 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129{
1130 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001131
1132 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001133 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001134
1135 /* Shortcuts for common default encodings */
1136 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001138 else if (strcmp(encoding, "latin-1") == 0)
1139 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001140#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1141 else if (strcmp(encoding, "mbcs") == 0)
1142 return PyUnicode_DecodeMBCS(s, size, errors);
1143#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001144 else if (strcmp(encoding, "ascii") == 0)
1145 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146
1147 /* Decode via the codec registry */
1148 buffer = PyBuffer_FromMemory((void *)s, size);
1149 if (buffer == NULL)
1150 goto onError;
1151 unicode = PyCodec_Decode(buffer, encoding, errors);
1152 if (unicode == NULL)
1153 goto onError;
1154 if (!PyUnicode_Check(unicode)) {
1155 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001156 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001157 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 Py_DECREF(unicode);
1159 goto onError;
1160 }
1161 Py_DECREF(buffer);
1162 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001163
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001164 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 Py_XDECREF(buffer);
1166 return NULL;
1167}
1168
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001169PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1170 const char *encoding,
1171 const char *errors)
1172{
1173 PyObject *v;
1174
1175 if (!PyUnicode_Check(unicode)) {
1176 PyErr_BadArgument();
1177 goto onError;
1178 }
1179
1180 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001181 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001182
1183 /* Decode via the codec registry */
1184 v = PyCodec_Decode(unicode, encoding, errors);
1185 if (v == NULL)
1186 goto onError;
1187 return v;
1188
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001189 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001190 return NULL;
1191}
1192
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001194 Py_ssize_t size,
1195 const char *encoding,
1196 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197{
1198 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001199
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 unicode = PyUnicode_FromUnicode(s, size);
1201 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1204 Py_DECREF(unicode);
1205 return v;
1206}
1207
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001208PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1209 const char *encoding,
1210 const char *errors)
1211{
1212 PyObject *v;
1213
1214 if (!PyUnicode_Check(unicode)) {
1215 PyErr_BadArgument();
1216 goto onError;
1217 }
1218
1219 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001220 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001221
1222 /* Encode via the codec registry */
1223 v = PyCodec_Encode(unicode, encoding, errors);
1224 if (v == NULL)
1225 goto onError;
1226 return v;
1227
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001228 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001229 return NULL;
1230}
1231
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1233 const char *encoding,
1234 const char *errors)
1235{
1236 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001237
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 if (!PyUnicode_Check(unicode)) {
1239 PyErr_BadArgument();
1240 goto onError;
1241 }
Fred Drakee4315f52000-05-09 19:53:39 +00001242
Tim Petersced69f82003-09-16 20:30:58 +00001243 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001244 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001245
1246 /* Shortcuts for common default encodings */
1247 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001248 if (strcmp(encoding, "utf-8") == 0)
1249 return PyUnicode_AsUTF8String(unicode);
1250 else if (strcmp(encoding, "latin-1") == 0)
1251 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001252#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001253 else if (strcmp(encoding, "mbcs") == 0)
1254 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001255#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001256 else if (strcmp(encoding, "ascii") == 0)
1257 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259
1260 /* Encode via the codec registry */
1261 v = PyCodec_Encode(unicode, encoding, errors);
1262 if (v == NULL)
1263 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001264 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001265 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001266 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001267 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 Py_DECREF(v);
1269 goto onError;
1270 }
1271 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001272
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001273 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274 return NULL;
1275}
1276
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001277PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001278 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001279{
1280 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1281
1282 if (v)
1283 return v;
1284 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1285 if (v && errors == NULL)
1286 ((PyUnicodeObject *)unicode)->defenc = v;
1287 return v;
1288}
1289
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1291{
1292 if (!PyUnicode_Check(unicode)) {
1293 PyErr_BadArgument();
1294 goto onError;
1295 }
1296 return PyUnicode_AS_UNICODE(unicode);
1297
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001298 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299 return NULL;
1300}
1301
Martin v. Löwis18e16552006-02-15 17:27:45 +00001302Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303{
1304 if (!PyUnicode_Check(unicode)) {
1305 PyErr_BadArgument();
1306 goto onError;
1307 }
1308 return PyUnicode_GET_SIZE(unicode);
1309
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001310 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 return -1;
1312}
1313
Thomas Wouters78890102000-07-22 19:25:51 +00001314const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001315{
1316 return unicode_default_encoding;
1317}
1318
1319int PyUnicode_SetDefaultEncoding(const char *encoding)
1320{
1321 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001322
Fred Drakee4315f52000-05-09 19:53:39 +00001323 /* Make sure the encoding is valid. As side effect, this also
1324 loads the encoding into the codec registry cache. */
1325 v = _PyCodec_Lookup(encoding);
1326 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001327 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001328 Py_DECREF(v);
1329 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001330 encoding,
1331 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001332 return 0;
1333
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001334 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001335 return -1;
1336}
1337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001338/* error handling callback helper:
1339 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001340 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001341 and adjust various state variables.
1342 return 0 on success, -1 on error
1343*/
1344
1345static
1346int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001347 const char *encoding, const char *reason,
1348 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1349 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1350 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001351{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001352 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001353
1354 PyObject *restuple = NULL;
1355 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001356 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1357 Py_ssize_t requiredsize;
1358 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001359 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001360 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001361 int res = -1;
1362
1363 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001364 *errorHandler = PyCodec_LookupError(errors);
1365 if (*errorHandler == NULL)
1366 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001367 }
1368
1369 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001370 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001371 encoding, input, insize, *startinpos, *endinpos, reason);
1372 if (*exceptionObject == NULL)
1373 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001374 }
1375 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001376 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1377 goto onError;
1378 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1379 goto onError;
1380 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1381 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001382 }
1383
1384 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1385 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001386 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001387 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001388 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001389 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001390 }
1391 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001392 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001394 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001395 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001396 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1397 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001398 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399
1400 /* need more space? (at least enough for what we
1401 have+the replacement+the rest of the string (starting
1402 at the new input position), so we won't have to check space
1403 when there are no errors in the rest of the string) */
1404 repptr = PyUnicode_AS_UNICODE(repunicode);
1405 repsize = PyUnicode_GET_SIZE(repunicode);
1406 requiredsize = *outpos + repsize + insize-newpos;
1407 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001408 if (requiredsize<2*outsize)
1409 requiredsize = 2*outsize;
1410 if (_PyUnicode_Resize(output, requiredsize) < 0)
1411 goto onError;
1412 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001413 }
1414 *endinpos = newpos;
1415 *inptr = input + newpos;
1416 Py_UNICODE_COPY(*outptr, repptr, repsize);
1417 *outptr += repsize;
1418 *outpos += repsize;
1419 /* we made it! */
1420 res = 0;
1421
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001422 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001423 Py_XDECREF(restuple);
1424 return res;
1425}
1426
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001427/* --- UTF-7 Codec -------------------------------------------------------- */
1428
1429/* see RFC2152 for details */
1430
/* Per-character classification used by the UTF-7 codec, indexed by ASCII
   code (0..127).  Each entry says whether the character can be written
   directly or has to be encoded:

     0 - not special
     1 - special
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00-0x0f: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1f: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2f: space and punctuation; '+' and '/' are special */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3f: digits, ':' and '?' are direct */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4f: '@', 'A'..'O' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5f: 'P'..'Z', then brackets; backslash is special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6f: backquote, 'a'..'o' */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7f: 'p'..'z', then braces; '~' and DEL are special */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
1449
/* SPECIAL(c, encodeO, encodeWS) is true when character code `c` has to be
   encoded rather than written directly:
     - always for codes above 127, for codes <= 0 and for table class 1;
     - for whitespace (class 2) only when `encodeWS` is true;
     - for RFC2152 Set O characters (class 3) only when `encodeO` is true.

   Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true.

   `c` is evaluated several times, so it must be free of side effects.
   The two flag arguments are parenthesized so that a compound expression
   such as `a || b` is evaluated as a unit instead of being split by the
   surrounding && operator. */

#define SPECIAL(c, encodeO, encodeWS)                           \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 ||         \
     ((encodeWS) && (utf7_special[(c)] == 2)) ||                \
     ((encodeO) && (utf7_special[(c)] == 3)))
1459
/* Base64 helpers for the UTF-7 codec (alphabet A-Z a-z 0-9 + /).

   B64(n)     - alphabet character for the low six bits of `n`.
   B64CHAR(c) - true when `c` is one of the 64 alphabet characters.
   UB64(c)    - six-bit value of alphabet character `c`; only meaningful
                when B64CHAR(c) is true.

   B64CHAR compares against explicit ASCII ranges rather than calling
   isalnum(): isalnum() is locale dependent and undefined for arguments
   outside the unsigned char range, so it could accept non-ASCII bytes
   (which UB64 would then turn into values above 63) or misbehave on wide
   character codes.  UB64 already relies on the ASCII layout.

   All three macros evaluate their argument more than once; pass
   expressions without side effects. */
#define B64(n)                                                              \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c)                                                          \
    (((c) >= 'A' && (c) <= 'Z') ||                                          \
     ((c) >= 'a' && (c) <= 'z') ||                                          \
     ((c) >= '0' && (c) <= '9') ||                                          \
     (c) == '+' || (c) == '/')
#define UB64(c)                                                             \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                       \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001467
/* ENCODE(out, ch, bits): flush complete sextets from a bit accumulator.

   `ch` holds `bits` pending low-order bits.  While at least six are
   pending, the topmost six are written through `out` as one base64
   character (via B64) and `bits` is reduced by six.  `out` and `bits`
   are modified in place, so both must be lvalues; fewer than six bits
   remain pending afterwards.

   The expansion is a bare while statement (no do/while(0) wrapper). */
#define ENCODE(out, ch, bits)                   \
    while ((bits) >= 6) {                       \
        (bits) -= 6;                            \
        *(out)++ = B64((ch) >> (bits));         \
    }
1473
/* DECODE(out, ch, bits, surrogate): drain complete 16-bit units from a
   bit accumulator.

   `ch` holds `bits` pending low-order bits.  While at least sixteen are
   pending, the topmost sixteen are taken off as one code unit and `bits`
   is reduced by sixteen.  Then:
     - if `surrogate` is set, an error was already reported for the first
       half of a pair, so this unit is dropped and the flag cleared;
     - if the unit lies in 0xDC00..0xDFFF, `surrogate` is set, `errmsg`
       is assigned and control jumps to the `utf7Error` label, because
       code pairs cannot be represented in a 16-bit character;
     - otherwise the unit is stored through `out`.

   The expanding function must provide `errmsg` and a `utf7Error` label.
   `out`, `bits` and `surrogate` are modified in place.

   NOTE(review): the range tested is 0xDC00..0xDFFF while the surrounding
   comments speak of the high surrogate -- confirm which half is meant. */
#define DECODE(out, ch, bits, surrogate)                                \
    while ((bits) >= 16) {                                              \
        Py_UNICODE outCh;                                               \
        (bits) -= 16;                                                   \
        outCh = (Py_UNICODE) (((ch) >> (bits)) & 0xffff);               \
        if (surrogate) {                                                \
            surrogate = 0;                                              \
        } else if (outCh < 0xDC00 || outCh > 0xDFFF) {                  \
            *(out)++ = outCh;                                           \
        } else {                                                        \
            surrogate = 1;                                              \
            errmsg = "code pairs are not supported";                    \
            goto utf7Error;                                             \
        }                                                               \
    }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001492
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001493PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001494 Py_ssize_t size,
1495 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001496{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001497 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1498}
1499
1500PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001501 Py_ssize_t size,
1502 const char *errors,
1503 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001504{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001505 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001506 Py_ssize_t startinpos;
1507 Py_ssize_t endinpos;
1508 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001509 const char *e;
1510 PyUnicodeObject *unicode;
1511 Py_UNICODE *p;
1512 const char *errmsg = "";
1513 int inShift = 0;
1514 unsigned int bitsleft = 0;
1515 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001516 int surrogate = 0;
1517 PyObject *errorHandler = NULL;
1518 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001519
1520 unicode = _PyUnicode_New(size);
1521 if (!unicode)
1522 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001523 if (size == 0) {
1524 if (consumed)
1525 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001526 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001527 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001528
1529 p = unicode->str;
1530 e = s + size;
1531
1532 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001533 Py_UNICODE ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001534 restart:
Antoine Pitrou4982d5d2008-07-25 17:45:59 +00001535 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001536
1537 if (inShift) {
1538 if ((ch == '-') || !B64CHAR(ch)) {
1539 inShift = 0;
1540 s++;
Tim Petersced69f82003-09-16 20:30:58 +00001541
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001542 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1543 if (bitsleft >= 6) {
1544 /* The shift sequence has a partial character in it. If
1545 bitsleft < 6 then we could just classify it as padding
1546 but that is not the case here */
1547
1548 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001549 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001550 }
1551 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +00001552 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001553 here so indicate the potential of a misencoded character. */
1554
1555 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
1556 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
1557 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001558 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001559 }
1560
1561 if (ch == '-') {
1562 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +00001563 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001564 inShift = 1;
1565 }
1566 } else if (SPECIAL(ch,0,0)) {
1567 errmsg = "unexpected special character";
Benjamin Peterson857ce152009-01-31 16:29:18 +00001568 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001569 } else {
1570 *p++ = ch;
1571 }
1572 } else {
1573 charsleft = (charsleft << 6) | UB64(ch);
1574 bitsleft += 6;
1575 s++;
1576 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
1577 }
1578 }
1579 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001580 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581 s++;
1582 if (s < e && *s == '-') {
1583 s++;
1584 *p++ = '+';
1585 } else
1586 {
1587 inShift = 1;
1588 bitsleft = 0;
1589 }
1590 }
1591 else if (SPECIAL(ch,0,0)) {
Walter Dörwald9d045422007-08-30 15:34:55 +00001592 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001593 errmsg = "unexpected special character";
1594 s++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001595 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001596 }
1597 else {
1598 *p++ = ch;
1599 s++;
1600 }
1601 continue;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001602 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001603 outpos = p-PyUnicode_AS_UNICODE(unicode);
1604 endinpos = s-starts;
1605 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001606 errors, &errorHandler,
1607 "utf7", errmsg,
1608 starts, size, &startinpos, &endinpos, &exc, &s,
1609 &unicode, &outpos, &p))
1610 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001611 }
1612
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001613 if (inShift && !consumed) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001614 outpos = p-PyUnicode_AS_UNICODE(unicode);
1615 endinpos = size;
1616 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001617 errors, &errorHandler,
1618 "utf7", "unterminated shift sequence",
1619 starts, size, &startinpos, &endinpos, &exc, &s,
1620 &unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001621 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001622 if (s < e)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001623 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001624 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001625 if (consumed) {
1626 if(inShift)
1627 *consumed = startinpos;
1628 else
1629 *consumed = s-starts;
1630 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001631
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001632 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001633 goto onError;
1634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001635 Py_XDECREF(errorHandler);
1636 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001637 return (PyObject *)unicode;
1638
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001639 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 Py_XDECREF(errorHandler);
1641 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001642 Py_DECREF(unicode);
1643 return NULL;
1644}
1645
1646
1647PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001648 Py_ssize_t size,
1649 int encodeSetO,
1650 int encodeWhiteSpace,
1651 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652{
1653 PyObject *v;
1654 /* It might be possible to tighten this worst case */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001655 Py_ssize_t cbAllocated = 5 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001656 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001657 Py_ssize_t i = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001658 unsigned int bitsleft = 0;
1659 unsigned long charsleft = 0;
1660 char * out;
1661 char * start;
1662
Neal Norwitze7d8be82008-07-31 17:17:14 +00001663 if (cbAllocated / 5 != size)
1664 return PyErr_NoMemory();
1665
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001666 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001667 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001668
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001669 v = PyString_FromStringAndSize(NULL, cbAllocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001670 if (v == NULL)
1671 return NULL;
1672
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001673 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001674 for (;i < size; ++i) {
1675 Py_UNICODE ch = s[i];
1676
1677 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001678 if (ch == '+') {
1679 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001680 *out++ = '-';
1681 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1682 charsleft = ch;
1683 bitsleft = 16;
1684 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001685 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001686 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001687 } else {
1688 *out++ = (char) ch;
1689 }
1690 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001691 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1692 *out++ = B64(charsleft << (6-bitsleft));
1693 charsleft = 0;
1694 bitsleft = 0;
1695 /* Characters not in the BASE64 set implicitly unshift the sequence
1696 so no '-' is required, except if the character is itself a '-' */
1697 if (B64CHAR(ch) || ch == '-') {
1698 *out++ = '-';
1699 }
1700 inShift = 0;
1701 *out++ = (char) ch;
1702 } else {
1703 bitsleft += 16;
1704 charsleft = (charsleft << 16) | ch;
1705 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1706
1707 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001708 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001709 or '-' then the shift sequence will be terminated implicitly and we
1710 don't have to insert a '-'. */
1711
1712 if (bitsleft == 0) {
1713 if (i + 1 < size) {
1714 Py_UNICODE ch2 = s[i+1];
1715
1716 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001717
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001718 } else if (B64CHAR(ch2) || ch2 == '-') {
1719 *out++ = '-';
1720 inShift = 0;
1721 } else {
1722 inShift = 0;
1723 }
1724
1725 }
1726 else {
1727 *out++ = '-';
1728 inShift = 0;
1729 }
1730 }
Tim Petersced69f82003-09-16 20:30:58 +00001731 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001732 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001733 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001734 if (bitsleft) {
1735 *out++= B64(charsleft << (6-bitsleft) );
1736 *out++ = '-';
1737 }
1738
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001739 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 return v;
1741}
1742
1743#undef SPECIAL
1744#undef B64
1745#undef B64CHAR
1746#undef UB64
1747#undef ENCODE
1748#undef DECODE
1749
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750/* --- UTF-8 Codec -------------------------------------------------------- */
1751
/* Sequence length implied by each possible UTF-8 lead byte, indexed by the
   byte value.  A zero entry marks a byte that is not a legal prefix.
   See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
1773
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001775 Py_ssize_t size,
1776 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001777{
Walter Dörwald69652032004-09-07 20:24:22 +00001778 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1779}
1780
1781PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001782 Py_ssize_t size,
1783 const char *errors,
1784 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001788 Py_ssize_t startinpos;
1789 Py_ssize_t endinpos;
1790 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791 const char *e;
1792 PyUnicodeObject *unicode;
1793 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001794 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001795 PyObject *errorHandler = NULL;
1796 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797
1798 /* Note: size will always be longer than the resulting Unicode
1799 character count */
1800 unicode = _PyUnicode_New(size);
1801 if (!unicode)
1802 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001803 if (size == 0) {
1804 if (consumed)
1805 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808
1809 /* Unpack UTF-8 encoded data */
1810 p = unicode->str;
1811 e = s + size;
1812
1813 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001814 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815
1816 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001817 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818 s++;
1819 continue;
1820 }
1821
1822 n = utf8_code_length[ch];
1823
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001824 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001825 if (consumed)
1826 break;
1827 else {
1828 errmsg = "unexpected end of data";
1829 startinpos = s-starts;
1830 endinpos = size;
1831 goto utf8Error;
1832 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001833 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834
1835 switch (n) {
1836
1837 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001838 errmsg = "unexpected code byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001839 startinpos = s-starts;
1840 endinpos = startinpos+1;
1841 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842
1843 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001844 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001845 startinpos = s-starts;
1846 endinpos = startinpos+1;
1847 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848
1849 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001850 if ((s[1] & 0xc0) != 0x80) {
1851 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001852 startinpos = s-starts;
1853 endinpos = startinpos+2;
1854 goto utf8Error;
1855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001857 if (ch < 0x80) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001858 startinpos = s-starts;
1859 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001860 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001861 goto utf8Error;
1862 }
1863 else
1864 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865 break;
1866
1867 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001868 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001869 (s[2] & 0xc0) != 0x80) {
1870 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001871 startinpos = s-starts;
1872 endinpos = startinpos+3;
1873 goto utf8Error;
1874 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001876 if (ch < 0x0800) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001877 /* Note: UTF-8 encodings of surrogates are considered
1878 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001879
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001880 XXX For wide builds (UCS-4) we should probably try
1881 to recombine the surrogates into a single code
1882 unit.
1883 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001884 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001885 startinpos = s-starts;
1886 endinpos = startinpos+3;
1887 goto utf8Error;
1888 }
1889 else
1890 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001891 break;
1892
1893 case 4:
1894 if ((s[1] & 0xc0) != 0x80 ||
1895 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001896 (s[3] & 0xc0) != 0x80) {
1897 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001898 startinpos = s-starts;
1899 endinpos = startinpos+4;
1900 goto utf8Error;
1901 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001902 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001903 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001904 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001905 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001906 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001907 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001908 UTF-16 */
1909 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001910 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001911 startinpos = s-starts;
1912 endinpos = startinpos+4;
1913 goto utf8Error;
1914 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001915#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001916 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001917#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001918 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001919
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001920 /* translate from 10000..10FFFF to 0..FFFF */
1921 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001922
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001923 /* high surrogate = top 10 bits added to D800 */
1924 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001925
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001926 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001927 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001928#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 break;
1930
1931 default:
1932 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001933 errmsg = "unsupported Unicode code range";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001934 startinpos = s-starts;
1935 endinpos = startinpos+n;
1936 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 }
1938 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001939 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001940
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001941 utf8Error:
1942 outpos = p-PyUnicode_AS_UNICODE(unicode);
1943 if (unicode_decode_call_errorhandler(
1944 errors, &errorHandler,
1945 "utf8", errmsg,
1946 starts, size, &startinpos, &endinpos, &exc, &s,
1947 &unicode, &outpos, &p))
1948 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 }
Walter Dörwald69652032004-09-07 20:24:22 +00001950 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001951 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952
1953 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001954 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955 goto onError;
1956
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001957 Py_XDECREF(errorHandler);
1958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 return (PyObject *)unicode;
1960
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001961 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001962 Py_XDECREF(errorHandler);
1963 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 Py_DECREF(unicode);
1965 return NULL;
1966}
1967
Tim Peters602f7402002-04-27 18:03:26 +00001968/* Allocation strategy: if the string is short, convert into a stack buffer
1969 and allocate exactly as much space needed at the end. Else allocate the
1970 maximum possible needed (4 result bytes per Unicode character), and return
1971 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001972*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001973PyObject *
1974PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001975 Py_ssize_t size,
1976 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977{
Tim Peters602f7402002-04-27 18:03:26 +00001978#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001979
Martin v. Löwis18e16552006-02-15 17:27:45 +00001980 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00001981 PyObject *v; /* result string object */
1982 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00001983 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00001984 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00001985 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001986
Tim Peters602f7402002-04-27 18:03:26 +00001987 assert(s != NULL);
1988 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989
Tim Peters602f7402002-04-27 18:03:26 +00001990 if (size <= MAX_SHORT_UNICHARS) {
1991 /* Write into the stack buffer; nallocated can't overflow.
1992 * At the end, we'll allocate exactly as much heap space as it
1993 * turns out we need.
1994 */
1995 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1996 v = NULL; /* will allocate after we're done */
1997 p = stackbuf;
1998 }
1999 else {
2000 /* Overallocate on the heap, and give the excess back at the end. */
2001 nallocated = size * 4;
2002 if (nallocated / 4 != size) /* overflow! */
2003 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002004 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002005 if (v == NULL)
2006 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002007 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002008 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002009
Tim Peters602f7402002-04-27 18:03:26 +00002010 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002011 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002012
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002013 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002014 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002016
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002018 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002019 *p++ = (char)(0xc0 | (ch >> 6));
2020 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002021 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002022 else {
Tim Peters602f7402002-04-27 18:03:26 +00002023 /* Encode UCS2 Unicode ordinals */
2024 if (ch < 0x10000) {
2025 /* Special case: check for high surrogate */
2026 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2027 Py_UCS4 ch2 = s[i];
2028 /* Check for low surrogate and combine the two to
2029 form a UCS4 value */
2030 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002031 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002032 i++;
2033 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002034 }
Tim Peters602f7402002-04-27 18:03:26 +00002035 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002036 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002037 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002038 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2039 *p++ = (char)(0x80 | (ch & 0x3f));
2040 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002041 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002042 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002043 /* Encode UCS4 Unicode ordinals */
2044 *p++ = (char)(0xf0 | (ch >> 18));
2045 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2046 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2047 *p++ = (char)(0x80 | (ch & 0x3f));
2048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002049 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002050
Tim Peters602f7402002-04-27 18:03:26 +00002051 if (v == NULL) {
2052 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002053 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002054 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002055 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002056 }
2057 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002058 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002059 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002060 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002061 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002064
Tim Peters602f7402002-04-27 18:03:26 +00002065#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066}
2067
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2069{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002070 if (!PyUnicode_Check(unicode)) {
2071 PyErr_BadArgument();
2072 return NULL;
2073 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002074 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002075 PyUnicode_GET_SIZE(unicode),
2076 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077}
2078
Walter Dörwald6e390802007-08-17 16:41:28 +00002079/* --- UTF-32 Codec ------------------------------------------------------- */
2080
2081PyObject *
2082PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002083 Py_ssize_t size,
2084 const char *errors,
2085 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002086{
2087 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2088}
2089
2090PyObject *
2091PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002092 Py_ssize_t size,
2093 const char *errors,
2094 int *byteorder,
2095 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002096{
2097 const char *starts = s;
2098 Py_ssize_t startinpos;
2099 Py_ssize_t endinpos;
2100 Py_ssize_t outpos;
2101 PyUnicodeObject *unicode;
2102 Py_UNICODE *p;
2103#ifndef Py_UNICODE_WIDE
2104 int i, pairs;
2105#else
2106 const int pairs = 0;
2107#endif
2108 const unsigned char *q, *e;
2109 int bo = 0; /* assume native ordering by default */
2110 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002111 /* Offsets from q for retrieving bytes in the right order. */
2112#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2113 int iorder[] = {0, 1, 2, 3};
2114#else
2115 int iorder[] = {3, 2, 1, 0};
2116#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002117 PyObject *errorHandler = NULL;
2118 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002119 /* On narrow builds we split characters outside the BMP into two
2120 codepoints => count how much extra space we need. */
2121#ifndef Py_UNICODE_WIDE
2122 for (i = pairs = 0; i < size/4; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002123 if (((Py_UCS4 *)s)[i] >= 0x10000)
2124 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002125#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002126
2127 /* This might be one to much, because of a BOM */
2128 unicode = _PyUnicode_New((size+3)/4+pairs);
2129 if (!unicode)
2130 return NULL;
2131 if (size == 0)
2132 return (PyObject *)unicode;
2133
2134 /* Unpack UTF-32 encoded data */
2135 p = unicode->str;
2136 q = (unsigned char *)s;
2137 e = q + size;
2138
2139 if (byteorder)
2140 bo = *byteorder;
2141
2142 /* Check for BOM marks (U+FEFF) in the input and adjust current
2143 byte order setting accordingly. In native mode, the leading BOM
2144 mark is skipped, in all other modes, it is copied to the output
2145 stream as-is (giving a ZWNBSP character). */
2146 if (bo == 0) {
2147 if (size >= 4) {
2148 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002149 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002150#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002151 if (bom == 0x0000FEFF) {
2152 q += 4;
2153 bo = -1;
2154 }
2155 else if (bom == 0xFFFE0000) {
2156 q += 4;
2157 bo = 1;
2158 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002159#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002160 if (bom == 0x0000FEFF) {
2161 q += 4;
2162 bo = 1;
2163 }
2164 else if (bom == 0xFFFE0000) {
2165 q += 4;
2166 bo = -1;
2167 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002168#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002169 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002170 }
2171
2172 if (bo == -1) {
2173 /* force LE */
2174 iorder[0] = 0;
2175 iorder[1] = 1;
2176 iorder[2] = 2;
2177 iorder[3] = 3;
2178 }
2179 else if (bo == 1) {
2180 /* force BE */
2181 iorder[0] = 3;
2182 iorder[1] = 2;
2183 iorder[2] = 1;
2184 iorder[3] = 0;
2185 }
2186
2187 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002188 Py_UCS4 ch;
2189 /* remaining bytes at the end? (size should be divisible by 4) */
2190 if (e-q<4) {
2191 if (consumed)
2192 break;
2193 errmsg = "truncated data";
2194 startinpos = ((const char *)q)-starts;
2195 endinpos = ((const char *)e)-starts;
2196 goto utf32Error;
2197 /* The remaining input chars are ignored if the callback
2198 chooses to skip the input */
2199 }
2200 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2201 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002202
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002203 if (ch >= 0x110000)
2204 {
2205 errmsg = "codepoint not in range(0x110000)";
2206 startinpos = ((const char *)q)-starts;
2207 endinpos = startinpos+4;
2208 goto utf32Error;
2209 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002210#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002211 if (ch >= 0x10000)
2212 {
2213 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2214 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2215 }
2216 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002217#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002218 *p++ = ch;
2219 q += 4;
2220 continue;
2221 utf32Error:
2222 outpos = p-PyUnicode_AS_UNICODE(unicode);
2223 if (unicode_decode_call_errorhandler(
2224 errors, &errorHandler,
2225 "utf32", errmsg,
2226 starts, size, &startinpos, &endinpos, &exc, &s,
2227 &unicode, &outpos, &p))
2228 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002229 }
2230
2231 if (byteorder)
2232 *byteorder = bo;
2233
2234 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002235 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002236
2237 /* Adjust length */
2238 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2239 goto onError;
2240
2241 Py_XDECREF(errorHandler);
2242 Py_XDECREF(exc);
2243 return (PyObject *)unicode;
2244
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002245 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002246 Py_DECREF(unicode);
2247 Py_XDECREF(errorHandler);
2248 Py_XDECREF(exc);
2249 return NULL;
2250}
2251
2252PyObject *
2253PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002254 Py_ssize_t size,
2255 const char *errors,
2256 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002257{
2258 PyObject *v;
2259 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002260 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002261#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002262 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002263#else
2264 const int pairs = 0;
2265#endif
2266 /* Offsets from p for storing byte pairs in the right order. */
2267#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2268 int iorder[] = {0, 1, 2, 3};
2269#else
2270 int iorder[] = {3, 2, 1, 0};
2271#endif
2272
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002273#define STORECHAR(CH) \
2274 do { \
2275 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2276 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2277 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2278 p[iorder[0]] = (CH) & 0xff; \
2279 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002280 } while(0)
2281
2282 /* In narrow builds we can output surrogate pairs as one codepoint,
2283 so we need less space. */
2284#ifndef Py_UNICODE_WIDE
2285 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002286 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2287 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2288 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002289#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002290 nsize = (size - pairs + (byteorder == 0));
2291 bytesize = nsize * 4;
2292 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002293 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002294 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002295 if (v == NULL)
2296 return NULL;
2297
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002298 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002299 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002300 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002301 if (size == 0)
2302 return v;
2303
2304 if (byteorder == -1) {
2305 /* force LE */
2306 iorder[0] = 0;
2307 iorder[1] = 1;
2308 iorder[2] = 2;
2309 iorder[3] = 3;
2310 }
2311 else if (byteorder == 1) {
2312 /* force BE */
2313 iorder[0] = 3;
2314 iorder[1] = 2;
2315 iorder[2] = 1;
2316 iorder[3] = 0;
2317 }
2318
2319 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002320 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002321#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002322 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2323 Py_UCS4 ch2 = *s;
2324 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2325 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2326 s++;
2327 size--;
2328 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002329 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002330#endif
2331 STORECHAR(ch);
2332 }
2333 return v;
2334#undef STORECHAR
2335}
2336
2337PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2338{
2339 if (!PyUnicode_Check(unicode)) {
2340 PyErr_BadArgument();
2341 return NULL;
2342 }
2343 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002344 PyUnicode_GET_SIZE(unicode),
2345 NULL,
2346 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002347}
2348
Guido van Rossumd57fd912000-03-10 22:53:23 +00002349/* --- UTF-16 Codec ------------------------------------------------------- */
2350
Tim Peters772747b2001-08-09 22:21:55 +00002351PyObject *
2352PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002353 Py_ssize_t size,
2354 const char *errors,
2355 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356{
Walter Dörwald69652032004-09-07 20:24:22 +00002357 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2358}
2359
2360PyObject *
2361PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002362 Py_ssize_t size,
2363 const char *errors,
2364 int *byteorder,
2365 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002366{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002367 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002368 Py_ssize_t startinpos;
2369 Py_ssize_t endinpos;
2370 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371 PyUnicodeObject *unicode;
2372 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002373 const unsigned char *q, *e;
2374 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002375 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002376 /* Offsets from q for retrieving byte pairs in the right order. */
2377#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2378 int ihi = 1, ilo = 0;
2379#else
2380 int ihi = 0, ilo = 1;
2381#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002382 PyObject *errorHandler = NULL;
2383 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384
2385 /* Note: size will always be longer than the resulting Unicode
2386 character count */
2387 unicode = _PyUnicode_New(size);
2388 if (!unicode)
2389 return NULL;
2390 if (size == 0)
2391 return (PyObject *)unicode;
2392
2393 /* Unpack UTF-16 encoded data */
2394 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002395 q = (unsigned char *)s;
2396 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002397
2398 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002399 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002400
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002401 /* Check for BOM marks (U+FEFF) in the input and adjust current
2402 byte order setting accordingly. In native mode, the leading BOM
2403 mark is skipped, in all other modes, it is copied to the output
2404 stream as-is (giving a ZWNBSP character). */
2405 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002406 if (size >= 2) {
2407 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002408#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002409 if (bom == 0xFEFF) {
2410 q += 2;
2411 bo = -1;
2412 }
2413 else if (bom == 0xFFFE) {
2414 q += 2;
2415 bo = 1;
2416 }
Tim Petersced69f82003-09-16 20:30:58 +00002417#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002418 if (bom == 0xFEFF) {
2419 q += 2;
2420 bo = 1;
2421 }
2422 else if (bom == 0xFFFE) {
2423 q += 2;
2424 bo = -1;
2425 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002426#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002427 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429
Tim Peters772747b2001-08-09 22:21:55 +00002430 if (bo == -1) {
2431 /* force LE */
2432 ihi = 1;
2433 ilo = 0;
2434 }
2435 else if (bo == 1) {
2436 /* force BE */
2437 ihi = 0;
2438 ilo = 1;
2439 }
2440
2441 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002442 Py_UNICODE ch;
2443 /* remaining bytes at the end? (size should be even) */
2444 if (e-q<2) {
2445 if (consumed)
2446 break;
2447 errmsg = "truncated data";
2448 startinpos = ((const char *)q)-starts;
2449 endinpos = ((const char *)e)-starts;
2450 goto utf16Error;
2451 /* The remaining input chars are ignored if the callback
2452 chooses to skip the input */
2453 }
2454 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002455
Benjamin Peterson857ce152009-01-31 16:29:18 +00002456 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002457
2458 if (ch < 0xD800 || ch > 0xDFFF) {
2459 *p++ = ch;
2460 continue;
2461 }
2462
2463 /* UTF-16 code pair: */
2464 if (q >= e) {
2465 errmsg = "unexpected end of data";
2466 startinpos = (((const char *)q)-2)-starts;
2467 endinpos = ((const char *)e)-starts;
2468 goto utf16Error;
2469 }
2470 if (0xD800 <= ch && ch <= 0xDBFF) {
2471 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2472 q += 2;
2473 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002474#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002475 *p++ = ch;
2476 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002477#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002478 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002479#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002480 continue;
2481 }
2482 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002483 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002484 startinpos = (((const char *)q)-4)-starts;
2485 endinpos = startinpos+2;
2486 goto utf16Error;
2487 }
2488
Benjamin Peterson857ce152009-01-31 16:29:18 +00002489 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002490 errmsg = "illegal encoding";
2491 startinpos = (((const char *)q)-2)-starts;
2492 endinpos = startinpos+2;
2493 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002494
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002495 utf16Error:
2496 outpos = p-PyUnicode_AS_UNICODE(unicode);
2497 if (unicode_decode_call_errorhandler(
2498 errors, &errorHandler,
2499 "utf16", errmsg,
2500 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2501 &unicode, &outpos, &p))
2502 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 }
2504
2505 if (byteorder)
2506 *byteorder = bo;
2507
Walter Dörwald69652032004-09-07 20:24:22 +00002508 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002509 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002510
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002512 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002513 goto onError;
2514
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002515 Py_XDECREF(errorHandler);
2516 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 return (PyObject *)unicode;
2518
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002519 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002521 Py_XDECREF(errorHandler);
2522 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 return NULL;
2524}
2525
Tim Peters772747b2001-08-09 22:21:55 +00002526PyObject *
2527PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002528 Py_ssize_t size,
2529 const char *errors,
2530 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531{
2532 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002533 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002534 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002535#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002536 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002537#else
2538 const int pairs = 0;
2539#endif
Tim Peters772747b2001-08-09 22:21:55 +00002540 /* Offsets from p for storing byte pairs in the right order. */
2541#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2542 int ihi = 1, ilo = 0;
2543#else
2544 int ihi = 0, ilo = 1;
2545#endif
2546
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002547#define STORECHAR(CH) \
2548 do { \
2549 p[ihi] = ((CH) >> 8) & 0xff; \
2550 p[ilo] = (CH) & 0xff; \
2551 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002552 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002554#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002555 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002556 if (s[i] >= 0x10000)
2557 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002558#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002559 /* 2 * (size + pairs + (byteorder == 0)) */
2560 if (size > PY_SSIZE_T_MAX ||
2561 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002562 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002563 nsize = size + pairs + (byteorder == 0);
2564 bytesize = nsize * 2;
2565 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002566 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002567 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002568 if (v == NULL)
2569 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002571 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002573 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002574 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002575 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002576
2577 if (byteorder == -1) {
2578 /* force LE */
2579 ihi = 1;
2580 ilo = 0;
2581 }
2582 else if (byteorder == 1) {
2583 /* force BE */
2584 ihi = 0;
2585 ilo = 1;
2586 }
2587
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002588 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002589 Py_UNICODE ch = *s++;
2590 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002591#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002592 if (ch >= 0x10000) {
2593 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2594 ch = 0xD800 | ((ch-0x10000) >> 10);
2595 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002596#endif
Tim Peters772747b2001-08-09 22:21:55 +00002597 STORECHAR(ch);
2598 if (ch2)
2599 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002602#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603}
2604
2605PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2606{
2607 if (!PyUnicode_Check(unicode)) {
2608 PyErr_BadArgument();
2609 return NULL;
2610 }
2611 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002612 PyUnicode_GET_SIZE(unicode),
2613 NULL,
2614 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615}
2616
2617/* --- Unicode Escape Codec ----------------------------------------------- */
2618
Fredrik Lundh06d12682001-01-24 07:59:11 +00002619static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002620
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002622 Py_ssize_t size,
2623 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002625 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002626 Py_ssize_t startinpos;
2627 Py_ssize_t endinpos;
2628 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002629 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002630 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002631 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002633 char* message;
2634 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002635 PyObject *errorHandler = NULL;
2636 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002637
Guido van Rossumd57fd912000-03-10 22:53:23 +00002638 /* Escaped strings will always be longer than the resulting
2639 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 length after conversion to the true value.
2641 (but if the error callback returns a long replacement string
2642 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002643 v = _PyUnicode_New(size);
2644 if (v == NULL)
2645 goto onError;
2646 if (size == 0)
2647 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002649 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002651
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652 while (s < end) {
2653 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002654 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002655 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656
2657 /* Non-escape characters are interpreted as Unicode ordinals */
2658 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002659 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660 continue;
2661 }
2662
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002663 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002664 /* \ - Escapes */
2665 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002666 c = *s++;
2667 if (s > end)
2668 c = '\0'; /* Invalid after \ */
2669 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002670
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002671 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 case '\n': break;
2673 case '\\': *p++ = '\\'; break;
2674 case '\'': *p++ = '\''; break;
2675 case '\"': *p++ = '\"'; break;
2676 case 'b': *p++ = '\b'; break;
2677 case 'f': *p++ = '\014'; break; /* FF */
2678 case 't': *p++ = '\t'; break;
2679 case 'n': *p++ = '\n'; break;
2680 case 'r': *p++ = '\r'; break;
2681 case 'v': *p++ = '\013'; break; /* VT */
2682 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2683
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002684 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 case '0': case '1': case '2': case '3':
2686 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002687 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002688 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002689 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002690 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002691 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002693 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 break;
2695
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002696 /* hex escapes */
2697 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002699 digits = 2;
2700 message = "truncated \\xXX escape";
2701 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002702
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002703 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002705 digits = 4;
2706 message = "truncated \\uXXXX escape";
2707 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002708
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002709 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002710 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002711 digits = 8;
2712 message = "truncated \\UXXXXXXXX escape";
2713 hexescape:
2714 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 outpos = p-PyUnicode_AS_UNICODE(v);
2716 if (s+digits>end) {
2717 endinpos = size;
2718 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002719 errors, &errorHandler,
2720 "unicodeescape", "end of string in escape sequence",
2721 starts, size, &startinpos, &endinpos, &exc, &s,
2722 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 goto onError;
2724 goto nextByte;
2725 }
2726 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002727 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002728 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 endinpos = (s+i+1)-starts;
2730 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002731 errors, &errorHandler,
2732 "unicodeescape", message,
2733 starts, size, &startinpos, &endinpos, &exc, &s,
2734 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002735 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002737 }
2738 chr = (chr<<4) & ~0xF;
2739 if (c >= '0' && c <= '9')
2740 chr += c - '0';
2741 else if (c >= 'a' && c <= 'f')
2742 chr += 10 + c - 'a';
2743 else
2744 chr += 10 + c - 'A';
2745 }
2746 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002747 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002748 /* _decoding_error will have already written into the
2749 target buffer. */
2750 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002751 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002752 /* when we get here, chr is a 32-bit unicode character */
2753 if (chr <= 0xffff)
2754 /* UCS-2 character */
2755 *p++ = (Py_UNICODE) chr;
2756 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002757 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002758 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002759#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002760 *p++ = chr;
2761#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002762 chr -= 0x10000L;
2763 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002764 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002765#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002766 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767 endinpos = s-starts;
2768 outpos = p-PyUnicode_AS_UNICODE(v);
2769 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002770 errors, &errorHandler,
2771 "unicodeescape", "illegal Unicode character",
2772 starts, size, &startinpos, &endinpos, &exc, &s,
2773 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002774 goto onError;
2775 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002776 break;
2777
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002778 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002779 case 'N':
2780 message = "malformed \\N character escape";
2781 if (ucnhash_CAPI == NULL) {
2782 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002783 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002784 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002785 if (m == NULL)
2786 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002787 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002788 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002789 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002790 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002791 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002792 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002793 if (ucnhash_CAPI == NULL)
2794 goto ucnhashError;
2795 }
2796 if (*s == '{') {
2797 const char *start = s+1;
2798 /* look for the closing brace */
2799 while (*s != '}' && s < end)
2800 s++;
2801 if (s > start && s < end && *s == '}') {
2802 /* found a name. look it up in the unicode database */
2803 message = "unknown Unicode character name";
2804 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002805 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002806 goto store;
2807 }
2808 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002809 endinpos = s-starts;
2810 outpos = p-PyUnicode_AS_UNICODE(v);
2811 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002812 errors, &errorHandler,
2813 "unicodeescape", message,
2814 starts, size, &startinpos, &endinpos, &exc, &s,
2815 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002816 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002817 break;
2818
2819 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002820 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 message = "\\ at end of string";
2822 s--;
2823 endinpos = s-starts;
2824 outpos = p-PyUnicode_AS_UNICODE(v);
2825 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002826 errors, &errorHandler,
2827 "unicodeescape", message,
2828 starts, size, &startinpos, &endinpos, &exc, &s,
2829 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002830 goto onError;
2831 }
2832 else {
2833 *p++ = '\\';
2834 *p++ = (unsigned char)s[-1];
2835 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002836 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002838 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002839 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002841 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002843 Py_XDECREF(errorHandler);
2844 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002846
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002847 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002848 PyErr_SetString(
2849 PyExc_UnicodeError,
2850 "\\N escapes not supported (can't load unicodedata module)"
2851 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002852 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002853 Py_XDECREF(errorHandler);
2854 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002855 return NULL;
2856
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002857 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002859 Py_XDECREF(errorHandler);
2860 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 return NULL;
2862}
2863
2864/* Return a Unicode-Escape string version of the Unicode object.
2865
2866 If quotes is true, the string is enclosed in u"" or u'' quotes as
2867 appropriate.
2868
2869*/
2870
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002871Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002872 Py_ssize_t size,
2873 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002874{
2875 /* like wcschr, but doesn't stop at NULL characters */
2876
2877 while (size-- > 0) {
2878 if (*s == ch)
2879 return s;
2880 s++;
2881 }
2882
2883 return NULL;
2884}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002885
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886static
2887PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002888 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 int quotes)
2890{
2891 PyObject *repr;
2892 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002894 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002895#ifdef Py_UNICODE_WIDE
2896 const Py_ssize_t expandsize = 10;
2897#else
2898 const Py_ssize_t expandsize = 6;
2899#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900
Neal Norwitz17753ec2006-08-21 22:21:19 +00002901 /* XXX(nnorwitz): rather than over-allocating, it would be
2902 better to choose a different scheme. Perhaps scan the
2903 first N-chars of the string and allocate based on that size.
2904 */
2905 /* Initial allocation is based on the longest-possible unichr
2906 escape.
2907
2908 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
2909 unichr, so in this case it's the longest unichr escape. In
2910 narrow (UTF-16) builds this is five chars per source unichr
2911 since there are two unichrs in the surrogate pair, so in narrow
2912 (UTF-16) builds it's not the longest unichr escape.
2913
2914 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
2915 so in the narrow (UTF-16) build case it's the longest unichr
2916 escape.
2917 */
2918
Neal Norwitze7d8be82008-07-31 17:17:14 +00002919 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002920 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002921
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002922 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002923 2
2924 + expandsize*size
2925 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 if (repr == NULL)
2927 return NULL;
2928
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002929 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930
2931 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00002933 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 !findchar(s, size, '"')) ? '"' : '\'';
2935 }
2936 while (size-- > 0) {
2937 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002938
Hye-Shik Chang835b2432005-12-17 04:38:31 +00002939 /* Escape quotes and backslashes */
2940 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002941 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942 *p++ = '\\';
2943 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002944 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002945 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002946
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002947#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002948 /* Map 21-bit characters to '\U00xxxxxx' */
2949 else if (ch >= 0x10000) {
2950 *p++ = '\\';
2951 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002952 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2953 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2954 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2955 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2956 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2957 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2958 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002959 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002960 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002961 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002962#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002963 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
2964 else if (ch >= 0xD800 && ch < 0xDC00) {
2965 Py_UNICODE ch2;
2966 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002967
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002968 ch2 = *s++;
2969 size--;
2970 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2971 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2972 *p++ = '\\';
2973 *p++ = 'U';
2974 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2975 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2976 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2977 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2978 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2979 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2980 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2981 *p++ = hexdigit[ucs & 0x0000000F];
2982 continue;
2983 }
2984 /* Fall through: isolated surrogates are copied as-is */
2985 s--;
2986 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002987 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00002988#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002989
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002991 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 *p++ = '\\';
2993 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002994 *p++ = hexdigit[(ch >> 12) & 0x000F];
2995 *p++ = hexdigit[(ch >> 8) & 0x000F];
2996 *p++ = hexdigit[(ch >> 4) & 0x000F];
2997 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002999
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003000 /* Map special whitespace to '\t', \n', '\r' */
3001 else if (ch == '\t') {
3002 *p++ = '\\';
3003 *p++ = 't';
3004 }
3005 else if (ch == '\n') {
3006 *p++ = '\\';
3007 *p++ = 'n';
3008 }
3009 else if (ch == '\r') {
3010 *p++ = '\\';
3011 *p++ = 'r';
3012 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003013
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003014 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003015 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003017 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003018 *p++ = hexdigit[(ch >> 4) & 0x000F];
3019 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003020 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003021
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 /* Copy everything else as-is */
3023 else
3024 *p++ = (char) ch;
3025 }
3026 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003027 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003030 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 return repr;
3032}
3033
3034PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003035 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036{
3037 return unicodeescape_string(s, size, 0);
3038}
3039
3040PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3041{
3042 if (!PyUnicode_Check(unicode)) {
3043 PyErr_BadArgument();
3044 return NULL;
3045 }
3046 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003047 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048}
3049
3050/* --- Raw Unicode Escape Codec ------------------------------------------- */
3051
3052PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003053 Py_ssize_t size,
3054 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003056 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003057 Py_ssize_t startinpos;
3058 Py_ssize_t endinpos;
3059 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003061 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 const char *end;
3063 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003064 PyObject *errorHandler = NULL;
3065 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003066
Guido van Rossumd57fd912000-03-10 22:53:23 +00003067 /* Escaped strings will always be longer than the resulting
3068 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003069 length after conversion to the true value. (But decoding error
3070 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003071 v = _PyUnicode_New(size);
3072 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003073 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003074 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003075 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003076 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 end = s + size;
3078 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003079 unsigned char c;
3080 Py_UCS4 x;
3081 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003082 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003084 /* Non-escape characters are interpreted as Unicode ordinals */
3085 if (*s != '\\') {
3086 *p++ = (unsigned char)*s++;
3087 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003088 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003089 startinpos = s-starts;
3090
3091 /* \u-escapes are only interpreted iff the number of leading
3092 backslashes if odd */
3093 bs = s;
3094 for (;s < end;) {
3095 if (*s != '\\')
3096 break;
3097 *p++ = (unsigned char)*s++;
3098 }
3099 if (((s - bs) & 1) == 0 ||
3100 s >= end ||
3101 (*s != 'u' && *s != 'U')) {
3102 continue;
3103 }
3104 p--;
3105 count = *s=='u' ? 4 : 8;
3106 s++;
3107
3108 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3109 outpos = p-PyUnicode_AS_UNICODE(v);
3110 for (x = 0, i = 0; i < count; ++i, ++s) {
3111 c = (unsigned char)*s;
3112 if (!isxdigit(c)) {
3113 endinpos = s-starts;
3114 if (unicode_decode_call_errorhandler(
3115 errors, &errorHandler,
3116 "rawunicodeescape", "truncated \\uXXXX",
3117 starts, size, &startinpos, &endinpos, &exc, &s,
3118 &v, &outpos, &p))
3119 goto onError;
3120 goto nextByte;
3121 }
3122 x = (x<<4) & ~0xF;
3123 if (c >= '0' && c <= '9')
3124 x += c - '0';
3125 else if (c >= 'a' && c <= 'f')
3126 x += 10 + c - 'a';
3127 else
3128 x += 10 + c - 'A';
3129 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003130 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003131 /* UCS-2 character */
3132 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003133 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003134 /* UCS-4 character. Either store directly, or as
3135 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003136#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003137 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003138#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003139 x -= 0x10000L;
3140 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3141 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003142#endif
3143 } else {
3144 endinpos = s-starts;
3145 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003146 if (unicode_decode_call_errorhandler(
3147 errors, &errorHandler,
3148 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003149 starts, size, &startinpos, &endinpos, &exc, &s,
3150 &v, &outpos, &p))
3151 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003152 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003153 nextByte:
3154 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003156 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003157 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 Py_XDECREF(errorHandler);
3159 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003161
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003162 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 Py_XDECREF(errorHandler);
3165 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 return NULL;
3167}
3168
3169PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003170 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171{
3172 PyObject *repr;
3173 char *p;
3174 char *q;
3175
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003176 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003177#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003178 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003179#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003180 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003181#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003182
Neal Norwitze7d8be82008-07-31 17:17:14 +00003183 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003184 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003185
Neal Norwitze7d8be82008-07-31 17:17:14 +00003186 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187 if (repr == NULL)
3188 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003189 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003190 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003192 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 while (size-- > 0) {
3194 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003195#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003196 /* Map 32-bit characters to '\Uxxxxxxxx' */
3197 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003198 *p++ = '\\';
3199 *p++ = 'U';
3200 *p++ = hexdigit[(ch >> 28) & 0xf];
3201 *p++ = hexdigit[(ch >> 24) & 0xf];
3202 *p++ = hexdigit[(ch >> 20) & 0xf];
3203 *p++ = hexdigit[(ch >> 16) & 0xf];
3204 *p++ = hexdigit[(ch >> 12) & 0xf];
3205 *p++ = hexdigit[(ch >> 8) & 0xf];
3206 *p++ = hexdigit[(ch >> 4) & 0xf];
3207 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003208 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003209 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003210#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003211 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3212 if (ch >= 0xD800 && ch < 0xDC00) {
3213 Py_UNICODE ch2;
3214 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003215
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003216 ch2 = *s++;
3217 size--;
3218 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3219 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3220 *p++ = '\\';
3221 *p++ = 'U';
3222 *p++ = hexdigit[(ucs >> 28) & 0xf];
3223 *p++ = hexdigit[(ucs >> 24) & 0xf];
3224 *p++ = hexdigit[(ucs >> 20) & 0xf];
3225 *p++ = hexdigit[(ucs >> 16) & 0xf];
3226 *p++ = hexdigit[(ucs >> 12) & 0xf];
3227 *p++ = hexdigit[(ucs >> 8) & 0xf];
3228 *p++ = hexdigit[(ucs >> 4) & 0xf];
3229 *p++ = hexdigit[ucs & 0xf];
3230 continue;
3231 }
3232 /* Fall through: isolated surrogates are copied as-is */
3233 s--;
3234 size++;
3235 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003236#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003237 /* Map 16-bit characters to '\uxxxx' */
3238 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 *p++ = '\\';
3240 *p++ = 'u';
3241 *p++ = hexdigit[(ch >> 12) & 0xf];
3242 *p++ = hexdigit[(ch >> 8) & 0xf];
3243 *p++ = hexdigit[(ch >> 4) & 0xf];
3244 *p++ = hexdigit[ch & 15];
3245 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003246 /* Copy everything else as-is */
3247 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 *p++ = (char) ch;
3249 }
3250 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003251 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252 return repr;
3253}
3254
3255PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3256{
3257 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003258 PyErr_BadArgument();
3259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 }
3261 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003262 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263}
3264
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003265/* --- Unicode Internal Codec ------------------------------------------- */
3266
3267PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003268 Py_ssize_t size,
3269 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003270{
3271 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003272 Py_ssize_t startinpos;
3273 Py_ssize_t endinpos;
3274 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003275 PyUnicodeObject *v;
3276 Py_UNICODE *p;
3277 const char *end;
3278 const char *reason;
3279 PyObject *errorHandler = NULL;
3280 PyObject *exc = NULL;
3281
Neal Norwitzd43069c2006-01-08 01:12:10 +00003282#ifdef Py_UNICODE_WIDE
3283 Py_UNICODE unimax = PyUnicode_GetMax();
3284#endif
3285
Armin Rigo7ccbca92006-10-04 12:17:45 +00003286 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003287 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3288 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003290 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003291 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003292 p = PyUnicode_AS_UNICODE(v);
3293 end = s + size;
3294
3295 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003296 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003297 /* We have to sanity check the raw data, otherwise doom looms for
3298 some malformed UCS-4 data. */
3299 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003300#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003301 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003302#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003303 end-s < Py_UNICODE_SIZE
3304 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003305 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003306 startinpos = s - starts;
3307 if (end-s < Py_UNICODE_SIZE) {
3308 endinpos = end-starts;
3309 reason = "truncated input";
3310 }
3311 else {
3312 endinpos = s - starts + Py_UNICODE_SIZE;
3313 reason = "illegal code point (> 0x10FFFF)";
3314 }
3315 outpos = p - PyUnicode_AS_UNICODE(v);
3316 if (unicode_decode_call_errorhandler(
3317 errors, &errorHandler,
3318 "unicode_internal", reason,
3319 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003320 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003321 goto onError;
3322 }
3323 }
3324 else {
3325 p++;
3326 s += Py_UNICODE_SIZE;
3327 }
3328 }
3329
Martin v. Löwis412fb672006-04-13 06:34:32 +00003330 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003331 goto onError;
3332 Py_XDECREF(errorHandler);
3333 Py_XDECREF(exc);
3334 return (PyObject *)v;
3335
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003336 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003337 Py_XDECREF(v);
3338 Py_XDECREF(errorHandler);
3339 Py_XDECREF(exc);
3340 return NULL;
3341}
3342
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343/* --- Latin-1 Codec ------------------------------------------------------ */
3344
3345PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003346 Py_ssize_t size,
3347 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348{
3349 PyUnicodeObject *v;
3350 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003351
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003353 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003354 Py_UNICODE r = *(unsigned char*)s;
3355 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003356 }
3357
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 v = _PyUnicode_New(size);
3359 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003360 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003362 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363 p = PyUnicode_AS_UNICODE(v);
3364 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003365 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003367
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003368 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 Py_XDECREF(v);
3370 return NULL;
3371}
3372
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003373/* create or adjust a UnicodeEncodeError */
3374static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003375 const char *encoding,
3376 const Py_UNICODE *unicode, Py_ssize_t size,
3377 Py_ssize_t startpos, Py_ssize_t endpos,
3378 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003380 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003381 *exceptionObject = PyUnicodeEncodeError_Create(
3382 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383 }
3384 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003385 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3386 goto onError;
3387 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3388 goto onError;
3389 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3390 goto onError;
3391 return;
3392 onError:
3393 Py_DECREF(*exceptionObject);
3394 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003395 }
3396}
3397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398/* raises a UnicodeEncodeError */
3399static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003400 const char *encoding,
3401 const Py_UNICODE *unicode, Py_ssize_t size,
3402 Py_ssize_t startpos, Py_ssize_t endpos,
3403 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003404{
3405 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003406 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003408 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003409}
3410
3411/* error handling callback helper:
3412 build arguments, call the callback and check the arguments,
3413 put the result into newpos and return the replacement string, which
3414 has to be freed by the caller */
3415static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003416 PyObject **errorHandler,
3417 const char *encoding, const char *reason,
3418 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3419 Py_ssize_t startpos, Py_ssize_t endpos,
3420 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003422 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423
3424 PyObject *restuple;
3425 PyObject *resunicode;
3426
3427 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003428 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003429 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003430 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 }
3432
3433 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003434 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003435 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003436 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437
3438 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003439 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003440 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003441 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003442 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003443 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003444 Py_DECREF(restuple);
3445 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003446 }
3447 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003448 &resunicode, newpos)) {
3449 Py_DECREF(restuple);
3450 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451 }
3452 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003453 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003454 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003455 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3456 Py_DECREF(restuple);
3457 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003458 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003459 Py_INCREF(resunicode);
3460 Py_DECREF(restuple);
3461 return resunicode;
3462}
3463
3464static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003465 Py_ssize_t size,
3466 const char *errors,
3467 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003468{
3469 /* output object */
3470 PyObject *res;
3471 /* pointers to the beginning and end+1 of input */
3472 const Py_UNICODE *startp = p;
3473 const Py_UNICODE *endp = p + size;
3474 /* pointer to the beginning of the unencodable characters */
3475 /* const Py_UNICODE *badp = NULL; */
3476 /* pointer into the output */
3477 char *str;
3478 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003479 Py_ssize_t respos = 0;
3480 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003481 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3482 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 PyObject *errorHandler = NULL;
3484 PyObject *exc = NULL;
3485 /* the following variable is used for caching string comparisons
3486 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3487 int known_errorHandler = -1;
3488
3489 /* allocate enough for a simple encoding without
3490 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003491 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492 if (res == NULL)
3493 goto onError;
3494 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003495 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003496 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497 ressize = size;
3498
3499 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003500 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003502 /* can we encode this? */
3503 if (c<limit) {
3504 /* no overflow check, because we know that the space is enough */
3505 *str++ = (char)c;
3506 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003507 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003508 else {
3509 Py_ssize_t unicodepos = p-startp;
3510 Py_ssize_t requiredsize;
3511 PyObject *repunicode;
3512 Py_ssize_t repsize;
3513 Py_ssize_t newpos;
3514 Py_ssize_t respos;
3515 Py_UNICODE *uni2;
3516 /* startpos for collecting unencodable chars */
3517 const Py_UNICODE *collstart = p;
3518 const Py_UNICODE *collend = p;
3519 /* find all unecodable characters */
3520 while ((collend < endp) && ((*collend)>=limit))
3521 ++collend;
3522 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3523 if (known_errorHandler==-1) {
3524 if ((errors==NULL) || (!strcmp(errors, "strict")))
3525 known_errorHandler = 1;
3526 else if (!strcmp(errors, "replace"))
3527 known_errorHandler = 2;
3528 else if (!strcmp(errors, "ignore"))
3529 known_errorHandler = 3;
3530 else if (!strcmp(errors, "xmlcharrefreplace"))
3531 known_errorHandler = 4;
3532 else
3533 known_errorHandler = 0;
3534 }
3535 switch (known_errorHandler) {
3536 case 1: /* strict */
3537 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3538 goto onError;
3539 case 2: /* replace */
3540 while (collstart++<collend)
3541 *str++ = '?'; /* fall through */
3542 case 3: /* ignore */
3543 p = collend;
3544 break;
3545 case 4: /* xmlcharrefreplace */
3546 respos = str-PyString_AS_STRING(res);
3547 /* determine replacement size (temporarily (mis)uses p) */
3548 for (p = collstart, repsize = 0; p < collend; ++p) {
3549 if (*p<10)
3550 repsize += 2+1+1;
3551 else if (*p<100)
3552 repsize += 2+2+1;
3553 else if (*p<1000)
3554 repsize += 2+3+1;
3555 else if (*p<10000)
3556 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003557#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003558 else
3559 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003560#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003561 else if (*p<100000)
3562 repsize += 2+5+1;
3563 else if (*p<1000000)
3564 repsize += 2+6+1;
3565 else
3566 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003567#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003568 }
3569 requiredsize = respos+repsize+(endp-collend);
3570 if (requiredsize > ressize) {
3571 if (requiredsize<2*ressize)
3572 requiredsize = 2*ressize;
3573 if (_PyString_Resize(&res, requiredsize))
3574 goto onError;
3575 str = PyString_AS_STRING(res) + respos;
3576 ressize = requiredsize;
3577 }
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p = collstart; p < collend; ++p) {
3580 str += sprintf(str, "&#%d;", (int)*p);
3581 }
3582 p = collend;
3583 break;
3584 default:
3585 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3586 encoding, reason, startp, size, &exc,
3587 collstart-startp, collend-startp, &newpos);
3588 if (repunicode == NULL)
3589 goto onError;
3590 /* need more space? (at least enough for what we have+the
3591 replacement+the rest of the string, so we won't have to
3592 check space for encodable characters) */
3593 respos = str-PyString_AS_STRING(res);
3594 repsize = PyUnicode_GET_SIZE(repunicode);
3595 requiredsize = respos+repsize+(endp-collend);
3596 if (requiredsize > ressize) {
3597 if (requiredsize<2*ressize)
3598 requiredsize = 2*ressize;
3599 if (_PyString_Resize(&res, requiredsize)) {
3600 Py_DECREF(repunicode);
3601 goto onError;
3602 }
3603 str = PyString_AS_STRING(res) + respos;
3604 ressize = requiredsize;
3605 }
3606 /* check if there is anything unencodable in the replacement
3607 and copy it to the output */
3608 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3609 c = *uni2;
3610 if (c >= limit) {
3611 raise_encode_exception(&exc, encoding, startp, size,
3612 unicodepos, unicodepos+1, reason);
3613 Py_DECREF(repunicode);
3614 goto onError;
3615 }
3616 *str = (char)c;
3617 }
3618 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003619 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003620 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003621 }
3622 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003623 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003624 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003626 /* If this falls res will be NULL */
3627 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 Py_XDECREF(errorHandler);
3629 Py_XDECREF(exc);
3630 return res;
3631
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003632 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 Py_XDECREF(res);
3634 Py_XDECREF(errorHandler);
3635 Py_XDECREF(exc);
3636 return NULL;
3637}
3638
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003640 Py_ssize_t size,
3641 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003643 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644}
3645
3646PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3647{
3648 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003649 PyErr_BadArgument();
3650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 }
3652 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003653 PyUnicode_GET_SIZE(unicode),
3654 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655}
3656
3657/* --- 7-bit ASCII Codec -------------------------------------------------- */
3658
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003660 Py_ssize_t size,
3661 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003663 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 PyUnicodeObject *v;
3665 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003666 Py_ssize_t startinpos;
3667 Py_ssize_t endinpos;
3668 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 const char *e;
3670 PyObject *errorHandler = NULL;
3671 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003672
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003674 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003675 Py_UNICODE r = *(unsigned char*)s;
3676 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003677 }
Tim Petersced69f82003-09-16 20:30:58 +00003678
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679 v = _PyUnicode_New(size);
3680 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003681 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003683 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 e = s + size;
3686 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003687 register unsigned char c = (unsigned char)*s;
3688 if (c < 128) {
3689 *p++ = c;
3690 ++s;
3691 }
3692 else {
3693 startinpos = s-starts;
3694 endinpos = startinpos + 1;
3695 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3696 if (unicode_decode_call_errorhandler(
3697 errors, &errorHandler,
3698 "ascii", "ordinal not in range(128)",
3699 starts, size, &startinpos, &endinpos, &exc, &s,
3700 &v, &outpos, &p))
3701 goto onError;
3702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003704 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003705 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3706 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 Py_XDECREF(errorHandler);
3708 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003710
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003711 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 Py_XDECREF(errorHandler);
3714 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 return NULL;
3716}
3717
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003719 Py_ssize_t size,
3720 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723}
3724
3725PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3726{
3727 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003728 PyErr_BadArgument();
3729 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 }
3731 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003732 PyUnicode_GET_SIZE(unicode),
3733 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734}
3735
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003736#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003737
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003738/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003739
Martin v. Löwisd8251432006-06-14 05:21:04 +00003740#if SIZEOF_INT < SIZEOF_SSIZE_T
3741#define NEED_RETRY
3742#endif
3743
3744/* XXX This code is limited to "true" double-byte encodings, as
3745 a) it assumes an incomplete character consists of a single byte, and
3746 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003747 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003748
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts when the
   character found by CharPrev() does not swallow it as a trail byte. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (target - before == 2);
}
3759
3760/*
3761 * Decode MBCS string into unicode object. If 'final' is set, converts
3762 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3763 */
3764static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003765 const char *s, /* MBCS string */
3766 int size, /* sizeof MBCS string */
3767 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003768{
3769 Py_UNICODE *p;
3770 Py_ssize_t n = 0;
3771 int usize = 0;
3772
3773 assert(size >= 0);
3774
3775 /* Skip trailing lead-byte unless 'final' is set */
3776 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003777 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003778
3779 /* First get the size of the result */
3780 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003781 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3782 if (usize == 0) {
3783 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3784 return -1;
3785 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003786 }
3787
3788 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003789 /* Create unicode object */
3790 *v = _PyUnicode_New(usize);
3791 if (*v == NULL)
3792 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003793 }
3794 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003795 /* Extend unicode object */
3796 n = PyUnicode_GET_SIZE(*v);
3797 if (_PyUnicode_Resize(v, n + usize) < 0)
3798 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003799 }
3800
3801 /* Do the conversion */
3802 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003803 p = PyUnicode_AS_UNICODE(*v) + n;
3804 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3805 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3806 return -1;
3807 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003808 }
3809
3810 return size;
3811}
3812
3813PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003814 Py_ssize_t size,
3815 const char *errors,
3816 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003817{
3818 PyUnicodeObject *v = NULL;
3819 int done;
3820
3821 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003822 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003823
3824#ifdef NEED_RETRY
3825 retry:
3826 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003827 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003828 else
3829#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003830 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003831
3832 if (done < 0) {
3833 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003834 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003835 }
3836
3837 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003838 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003839
3840#ifdef NEED_RETRY
3841 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003842 s += done;
3843 size -= done;
3844 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003845 }
3846#endif
3847
3848 return (PyObject *)v;
3849}
3850
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003851PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003852 Py_ssize_t size,
3853 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003854{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003855 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3856}
3857
3858/*
3859 * Convert unicode into string object (MBCS).
3860 * Returns 0 if succeed, -1 otherwise.
3861 */
3862static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003863 const Py_UNICODE *p, /* unicode */
3864 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003865{
3866 int mbcssize = 0;
3867 Py_ssize_t n = 0;
3868
3869 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003870
3871 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003872 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003873 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3874 if (mbcssize == 0) {
3875 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3876 return -1;
3877 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003878 }
3879
Martin v. Löwisd8251432006-06-14 05:21:04 +00003880 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003881 /* Create string object */
3882 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3883 if (*repr == NULL)
3884 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003885 }
3886 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003887 /* Extend string object */
3888 n = PyString_Size(*repr);
3889 if (_PyString_Resize(repr, n + mbcssize) < 0)
3890 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003891 }
3892
3893 /* Do the conversion */
3894 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003895 char *s = PyString_AS_STRING(*repr) + n;
3896 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3897 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3898 return -1;
3899 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003900 }
3901
3902 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003903}
3904
3905PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003906 Py_ssize_t size,
3907 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003908{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003909 PyObject *repr = NULL;
3910 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00003911
Martin v. Löwisd8251432006-06-14 05:21:04 +00003912#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003913 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00003914 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003915 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003916 else
3917#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003918 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003919
Martin v. Löwisd8251432006-06-14 05:21:04 +00003920 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003921 Py_XDECREF(repr);
3922 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003923 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003924
3925#ifdef NEED_RETRY
3926 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003927 p += INT_MAX;
3928 size -= INT_MAX;
3929 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003930 }
3931#endif
3932
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003933 return repr;
3934}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003935
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003936PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
3937{
3938 if (!PyUnicode_Check(unicode)) {
3939 PyErr_BadArgument();
3940 return NULL;
3941 }
3942 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003943 PyUnicode_GET_SIZE(unicode),
3944 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003945}
3946
Martin v. Löwisd8251432006-06-14 05:21:04 +00003947#undef NEED_RETRY
3948
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003949#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003950
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951/* --- Character Mapping Codec -------------------------------------------- */
3952
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003954 Py_ssize_t size,
3955 PyObject *mapping,
3956 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003958 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003959 Py_ssize_t startinpos;
3960 Py_ssize_t endinpos;
3961 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963 PyUnicodeObject *v;
3964 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003965 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 PyObject *errorHandler = NULL;
3967 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003968 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003969 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003970
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 /* Default to Latin-1 */
3972 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003973 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974
3975 v = _PyUnicode_New(size);
3976 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003977 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003979 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00003982 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003983 mapstring = PyUnicode_AS_UNICODE(mapping);
3984 maplen = PyUnicode_GET_SIZE(mapping);
3985 while (s < e) {
3986 unsigned char ch = *s;
3987 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003989 if (ch < maplen)
3990 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003992 if (x == 0xfffe) {
3993 /* undefined mapping */
3994 outpos = p-PyUnicode_AS_UNICODE(v);
3995 startinpos = s-starts;
3996 endinpos = startinpos+1;
3997 if (unicode_decode_call_errorhandler(
3998 errors, &errorHandler,
3999 "charmap", "character maps to <undefined>",
4000 starts, size, &startinpos, &endinpos, &exc, &s,
4001 &v, &outpos, &p)) {
4002 goto onError;
4003 }
4004 continue;
4005 }
4006 *p++ = x;
4007 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004008 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004009 }
4010 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004011 while (s < e) {
4012 unsigned char ch = *s;
4013 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004014
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004015 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4016 w = PyInt_FromLong((long)ch);
4017 if (w == NULL)
4018 goto onError;
4019 x = PyObject_GetItem(mapping, w);
4020 Py_DECREF(w);
4021 if (x == NULL) {
4022 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4023 /* No mapping found means: mapping is undefined. */
4024 PyErr_Clear();
4025 x = Py_None;
4026 Py_INCREF(x);
4027 } else
4028 goto onError;
4029 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004030
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004031 /* Apply mapping */
4032 if (PyInt_Check(x)) {
4033 long value = PyInt_AS_LONG(x);
4034 if (value < 0 || value > 65535) {
4035 PyErr_SetString(PyExc_TypeError,
4036 "character mapping must be in range(65536)");
4037 Py_DECREF(x);
4038 goto onError;
4039 }
4040 *p++ = (Py_UNICODE)value;
4041 }
4042 else if (x == Py_None) {
4043 /* undefined mapping */
4044 outpos = p-PyUnicode_AS_UNICODE(v);
4045 startinpos = s-starts;
4046 endinpos = startinpos+1;
4047 if (unicode_decode_call_errorhandler(
4048 errors, &errorHandler,
4049 "charmap", "character maps to <undefined>",
4050 starts, size, &startinpos, &endinpos, &exc, &s,
4051 &v, &outpos, &p)) {
4052 Py_DECREF(x);
4053 goto onError;
4054 }
4055 Py_DECREF(x);
4056 continue;
4057 }
4058 else if (PyUnicode_Check(x)) {
4059 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004060
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004061 if (targetsize == 1)
4062 /* 1-1 mapping */
4063 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004064
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004065 else if (targetsize > 1) {
4066 /* 1-n mapping */
4067 if (targetsize > extrachars) {
4068 /* resize first */
4069 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4070 Py_ssize_t needed = (targetsize - extrachars) + \
4071 (targetsize << 2);
4072 extrachars += needed;
4073 /* XXX overflow detection missing */
4074 if (_PyUnicode_Resize(&v,
4075 PyUnicode_GET_SIZE(v) + needed) < 0) {
4076 Py_DECREF(x);
4077 goto onError;
4078 }
4079 p = PyUnicode_AS_UNICODE(v) + oldpos;
4080 }
4081 Py_UNICODE_COPY(p,
4082 PyUnicode_AS_UNICODE(x),
4083 targetsize);
4084 p += targetsize;
4085 extrachars -= targetsize;
4086 }
4087 /* 1-0 mapping: skip the character */
4088 }
4089 else {
4090 /* wrong return value */
4091 PyErr_SetString(PyExc_TypeError,
4092 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004093 Py_DECREF(x);
4094 goto onError;
4095 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004096 Py_DECREF(x);
4097 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 }
4100 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004101 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4102 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004103 Py_XDECREF(errorHandler);
4104 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004106
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004107 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 Py_XDECREF(errorHandler);
4109 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110 Py_XDECREF(v);
4111 return NULL;
4112}
4113
Martin v. Löwis3f767792006-06-04 19:36:28 +00004114/* Charmap encoding: the lookup table */
4115
4116struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004117 PyObject_HEAD
4118 unsigned char level1[32];
4119 int count2, count3;
4120 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004121};
4122
4123static PyObject*
4124encoding_map_size(PyObject *obj, PyObject* args)
4125{
4126 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004127 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004128 128*map->count3);
4129}
4130
4131static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004132 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004133 PyDoc_STR("Return the size (in bytes) of this object") },
4134 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004135};
4136
4137static void
4138encoding_map_dealloc(PyObject* o)
4139{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004140 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004141}
4142
4143static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004144 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004145 "EncodingMap", /*tp_name*/
4146 sizeof(struct encoding_map), /*tp_basicsize*/
4147 0, /*tp_itemsize*/
4148 /* methods */
4149 encoding_map_dealloc, /*tp_dealloc*/
4150 0, /*tp_print*/
4151 0, /*tp_getattr*/
4152 0, /*tp_setattr*/
4153 0, /*tp_compare*/
4154 0, /*tp_repr*/
4155 0, /*tp_as_number*/
4156 0, /*tp_as_sequence*/
4157 0, /*tp_as_mapping*/
4158 0, /*tp_hash*/
4159 0, /*tp_call*/
4160 0, /*tp_str*/
4161 0, /*tp_getattro*/
4162 0, /*tp_setattro*/
4163 0, /*tp_as_buffer*/
4164 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4165 0, /*tp_doc*/
4166 0, /*tp_traverse*/
4167 0, /*tp_clear*/
4168 0, /*tp_richcompare*/
4169 0, /*tp_weaklistoffset*/
4170 0, /*tp_iter*/
4171 0, /*tp_iternext*/
4172 encoding_map_methods, /*tp_methods*/
4173 0, /*tp_members*/
4174 0, /*tp_getset*/
4175 0, /*tp_base*/
4176 0, /*tp_dict*/
4177 0, /*tp_descr_get*/
4178 0, /*tp_descr_set*/
4179 0, /*tp_dictoffset*/
4180 0, /*tp_init*/
4181 0, /*tp_alloc*/
4182 0, /*tp_new*/
4183 0, /*tp_free*/
4184 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004185};
4186
4187PyObject*
4188PyUnicode_BuildEncodingMap(PyObject* string)
4189{
4190 Py_UNICODE *decode;
4191 PyObject *result;
4192 struct encoding_map *mresult;
4193 int i;
4194 int need_dict = 0;
4195 unsigned char level1[32];
4196 unsigned char level2[512];
4197 unsigned char *mlevel1, *mlevel2, *mlevel3;
4198 int count2 = 0, count3 = 0;
4199
4200 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4201 PyErr_BadArgument();
4202 return NULL;
4203 }
4204 decode = PyUnicode_AS_UNICODE(string);
4205 memset(level1, 0xFF, sizeof level1);
4206 memset(level2, 0xFF, sizeof level2);
4207
4208 /* If there isn't a one-to-one mapping of NULL to \0,
4209 or if there are non-BMP characters, we need to use
4210 a mapping dictionary. */
4211 if (decode[0] != 0)
4212 need_dict = 1;
4213 for (i = 1; i < 256; i++) {
4214 int l1, l2;
4215 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004216#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004217 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004218#endif
4219 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004220 need_dict = 1;
4221 break;
4222 }
4223 if (decode[i] == 0xFFFE)
4224 /* unmapped character */
4225 continue;
4226 l1 = decode[i] >> 11;
4227 l2 = decode[i] >> 7;
4228 if (level1[l1] == 0xFF)
4229 level1[l1] = count2++;
4230 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004231 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004232 }
4233
4234 if (count2 >= 0xFF || count3 >= 0xFF)
4235 need_dict = 1;
4236
4237 if (need_dict) {
4238 PyObject *result = PyDict_New();
4239 PyObject *key, *value;
4240 if (!result)
4241 return NULL;
4242 for (i = 0; i < 256; i++) {
4243 key = value = NULL;
4244 key = PyInt_FromLong(decode[i]);
4245 value = PyInt_FromLong(i);
4246 if (!key || !value)
4247 goto failed1;
4248 if (PyDict_SetItem(result, key, value) == -1)
4249 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004250 Py_DECREF(key);
4251 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004252 }
4253 return result;
4254 failed1:
4255 Py_XDECREF(key);
4256 Py_XDECREF(value);
4257 Py_DECREF(result);
4258 return NULL;
4259 }
4260
4261 /* Create a three-level trie */
4262 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4263 16*count2 + 128*count3 - 1);
4264 if (!result)
4265 return PyErr_NoMemory();
4266 PyObject_Init(result, &EncodingMapType);
4267 mresult = (struct encoding_map*)result;
4268 mresult->count2 = count2;
4269 mresult->count3 = count3;
4270 mlevel1 = mresult->level1;
4271 mlevel2 = mresult->level23;
4272 mlevel3 = mresult->level23 + 16*count2;
4273 memcpy(mlevel1, level1, 32);
4274 memset(mlevel2, 0xFF, 16*count2);
4275 memset(mlevel3, 0, 128*count3);
4276 count3 = 0;
4277 for (i = 1; i < 256; i++) {
4278 int o1, o2, o3, i2, i3;
4279 if (decode[i] == 0xFFFE)
4280 /* unmapped character */
4281 continue;
4282 o1 = decode[i]>>11;
4283 o2 = (decode[i]>>7) & 0xF;
4284 i2 = 16*mlevel1[o1] + o2;
4285 if (mlevel2[i2] == 0xFF)
4286 mlevel2[i2] = count3++;
4287 o3 = decode[i] & 0x7F;
4288 i3 = 128*mlevel2[i2] + o3;
4289 mlevel3[i3] = i;
4290 }
4291 return result;
4292}
4293
4294static int
4295encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4296{
4297 struct encoding_map *map = (struct encoding_map*)mapping;
4298 int l1 = c>>11;
4299 int l2 = (c>>7) & 0xF;
4300 int l3 = c & 0x7F;
4301 int i;
4302
4303#ifdef Py_UNICODE_WIDE
4304 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004305 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004306 }
4307#endif
4308 if (c == 0)
4309 return 0;
4310 /* level 1*/
4311 i = map->level1[l1];
4312 if (i == 0xFF) {
4313 return -1;
4314 }
4315 /* level 2*/
4316 i = map->level23[16*i+l2];
4317 if (i == 0xFF) {
4318 return -1;
4319 }
4320 /* level 3 */
4321 i = map->level23[16*map->count2 + 128*i + l3];
4322 if (i == 0) {
4323 return -1;
4324 }
4325 return i;
4326}
4327
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004328/* Lookup the character ch in the mapping. If the character
4329 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004330 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004333 PyObject *w = PyInt_FromLong((long)c);
4334 PyObject *x;
4335
4336 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004337 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004338 x = PyObject_GetItem(mapping, w);
4339 Py_DECREF(w);
4340 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004341 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4342 /* No mapping found means: mapping is undefined. */
4343 PyErr_Clear();
4344 x = Py_None;
4345 Py_INCREF(x);
4346 return x;
4347 } else
4348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004350 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004351 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004352 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004353 long value = PyInt_AS_LONG(x);
4354 if (value < 0 || value > 255) {
4355 PyErr_SetString(PyExc_TypeError,
4356 "character mapping must be in range(256)");
4357 Py_DECREF(x);
4358 return NULL;
4359 }
4360 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004362 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004363 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004365 /* wrong return value */
4366 PyErr_SetString(PyExc_TypeError,
4367 "character mapping must return integer, None or str");
4368 Py_DECREF(x);
4369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 }
4371}
4372
Martin v. Löwis3f767792006-06-04 19:36:28 +00004373static int
4374charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4375{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004376 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4377 /* exponentially overallocate to minimize reallocations */
4378 if (requiredsize < 2*outsize)
4379 requiredsize = 2*outsize;
4380 if (_PyString_Resize(outobj, requiredsize)) {
4381 return 0;
4382 }
4383 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004384}
4385
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded into the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize call failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389/* lookup the character, put the result in the output string and adjust
4390 various state variables. Reallocate the output string if not enough
4391 space is available. Return a new reference to the object that
4392 was put in the output buffer, or Py_None, if the mapping was undefined
4393 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004394 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004396charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004397 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004399 PyObject *rep;
4400 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004401 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402
Christian Heimese93237d2007-12-19 02:37:44 +00004403 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004404 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004405 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004406 if (res == -1)
4407 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004408 if (outsize<requiredsize)
4409 if (!charmapencode_resize(outobj, outpos, requiredsize))
4410 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004411 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004412 outstart[(*outpos)++] = (char)res;
4413 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004414 }
4415
4416 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004417 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004418 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004419 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004420 Py_DECREF(rep);
4421 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004422 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004423 if (PyInt_Check(rep)) {
4424 Py_ssize_t requiredsize = *outpos+1;
4425 if (outsize<requiredsize)
4426 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4427 Py_DECREF(rep);
4428 return enc_EXCEPTION;
4429 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004430 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004431 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004432 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004433 else {
4434 const char *repchars = PyString_AS_STRING(rep);
4435 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4436 Py_ssize_t requiredsize = *outpos+repsize;
4437 if (outsize<requiredsize)
4438 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4439 Py_DECREF(rep);
4440 return enc_EXCEPTION;
4441 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004442 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004443 memcpy(outstart + *outpos, repchars, repsize);
4444 *outpos += repsize;
4445 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004446 }
Georg Brandl9f167602006-06-04 21:46:16 +00004447 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004448 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449}
4450
4451/* handle an error in PyUnicode_EncodeCharmap
4452 Return 0 on success, -1 on error */
4453static
4454int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004455 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004457 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004458 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459{
4460 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004461 Py_ssize_t repsize;
4462 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004463 Py_UNICODE *uni2;
4464 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004465 Py_ssize_t collstartpos = *inpos;
4466 Py_ssize_t collendpos = *inpos+1;
4467 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004468 char *encoding = "charmap";
4469 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004470 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 /* find all unencodable characters */
4473 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004474 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004475 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004476 int res = encoding_map_lookup(p[collendpos], mapping);
4477 if (res != -1)
4478 break;
4479 ++collendpos;
4480 continue;
4481 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004482
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004483 rep = charmapencode_lookup(p[collendpos], mapping);
4484 if (rep==NULL)
4485 return -1;
4486 else if (rep!=Py_None) {
4487 Py_DECREF(rep);
4488 break;
4489 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004490 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004491 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 }
4493 /* cache callback name lookup
4494 * (if not done yet, i.e. it's the first error) */
4495 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004496 if ((errors==NULL) || (!strcmp(errors, "strict")))
4497 *known_errorHandler = 1;
4498 else if (!strcmp(errors, "replace"))
4499 *known_errorHandler = 2;
4500 else if (!strcmp(errors, "ignore"))
4501 *known_errorHandler = 3;
4502 else if (!strcmp(errors, "xmlcharrefreplace"))
4503 *known_errorHandler = 4;
4504 else
4505 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506 }
4507 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004508 case 1: /* strict */
4509 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4510 return -1;
4511 case 2: /* replace */
4512 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004513 x = charmapencode_output('?', mapping, res, respos);
4514 if (x==enc_EXCEPTION) {
4515 return -1;
4516 }
4517 else if (x==enc_FAILED) {
4518 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4519 return -1;
4520 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004521 }
4522 /* fall through */
4523 case 3: /* ignore */
4524 *inpos = collendpos;
4525 break;
4526 case 4: /* xmlcharrefreplace */
4527 /* generate replacement (temporarily (mis)uses p) */
4528 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004529 char buffer[2+29+1+1];
4530 char *cp;
4531 sprintf(buffer, "&#%d;", (int)p[collpos]);
4532 for (cp = buffer; *cp; ++cp) {
4533 x = charmapencode_output(*cp, mapping, res, respos);
4534 if (x==enc_EXCEPTION)
4535 return -1;
4536 else if (x==enc_FAILED) {
4537 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4538 return -1;
4539 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004540 }
4541 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004542 *inpos = collendpos;
4543 break;
4544 default:
4545 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004546 encoding, reason, p, size, exceptionObject,
4547 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004548 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004549 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004550 /* generate replacement */
4551 repsize = PyUnicode_GET_SIZE(repunicode);
4552 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004553 x = charmapencode_output(*uni2, mapping, res, respos);
4554 if (x==enc_EXCEPTION) {
4555 return -1;
4556 }
4557 else if (x==enc_FAILED) {
4558 Py_DECREF(repunicode);
4559 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4560 return -1;
4561 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004562 }
4563 *inpos = newpos;
4564 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 }
4566 return 0;
4567}
4568
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004570 Py_ssize_t size,
4571 PyObject *mapping,
4572 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004573{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 /* output object */
4575 PyObject *res = NULL;
4576 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004577 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004579 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 PyObject *errorHandler = NULL;
4581 PyObject *exc = NULL;
4582 /* the following variable is used for caching string comparisons
4583 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4584 * 3=ignore, 4=xmlcharrefreplace */
4585 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586
4587 /* Default to Latin-1 */
4588 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004589 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 /* allocate enough for a simple encoding without
4592 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004593 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594 if (res == NULL)
4595 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004596 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004597 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004600 /* try to encode it */
4601 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4602 if (x==enc_EXCEPTION) /* error */
4603 goto onError;
4604 if (x==enc_FAILED) { /* unencodable character */
4605 if (charmap_encoding_error(p, size, &inpos, mapping,
4606 &exc,
4607 &known_errorHandler, &errorHandler, errors,
4608 &res, &respos)) {
4609 goto onError;
4610 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004611 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004612 else
4613 /* done with this character => adjust input position */
4614 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004618 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004619 if (_PyString_Resize(&res, respos))
4620 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 }
4622 Py_XDECREF(exc);
4623 Py_XDECREF(errorHandler);
4624 return res;
4625
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004626 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 Py_XDECREF(res);
4628 Py_XDECREF(exc);
4629 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630 return NULL;
4631}
4632
4633PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004634 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635{
4636 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004637 PyErr_BadArgument();
4638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004639 }
4640 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004641 PyUnicode_GET_SIZE(unicode),
4642 mapping,
4643 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644}
4645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646/* create or adjust a UnicodeTranslateError */
4647static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004648 const Py_UNICODE *unicode, Py_ssize_t size,
4649 Py_ssize_t startpos, Py_ssize_t endpos,
4650 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004653 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004654 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655 }
4656 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004657 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4658 goto onError;
4659 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4660 goto onError;
4661 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4662 goto onError;
4663 return;
4664 onError:
4665 Py_DECREF(*exceptionObject);
4666 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 }
4668}
4669
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670/* raises a UnicodeTranslateError */
4671static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004672 const Py_UNICODE *unicode, Py_ssize_t size,
4673 Py_ssize_t startpos, Py_ssize_t endpos,
4674 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004675{
4676 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004677 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004678 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004679 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004680}
4681
4682/* error handling callback helper:
4683 build arguments, call the callback and check the arguments,
4684 put the result into newpos and return the replacement string, which
4685 has to be freed by the caller */
4686static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004687 PyObject **errorHandler,
4688 const char *reason,
4689 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4690 Py_ssize_t startpos, Py_ssize_t endpos,
4691 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004693 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004694
Martin v. Löwis412fb672006-04-13 06:34:32 +00004695 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 PyObject *restuple;
4697 PyObject *resunicode;
4698
4699 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004700 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004702 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 }
4704
4705 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004706 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004707 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004708 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004709
4710 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004711 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004712 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004713 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004715 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004716 Py_DECREF(restuple);
4717 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 }
4719 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004720 &resunicode, &i_newpos)) {
4721 Py_DECREF(restuple);
4722 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004724 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004725 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004726 else
4727 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004728 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004729 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4730 Py_DECREF(restuple);
4731 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004732 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004733 Py_INCREF(resunicode);
4734 Py_DECREF(restuple);
4735 return resunicode;
4736}
4737
4738/* Lookup the character ch in the mapping and put the result in result,
4739 which must be decrefed by the caller.
4740 Return 0 on success, -1 on error */
4741static
4742int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4743{
4744 PyObject *w = PyInt_FromLong((long)c);
4745 PyObject *x;
4746
4747 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004748 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 x = PyObject_GetItem(mapping, w);
4750 Py_DECREF(w);
4751 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004752 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4753 /* No mapping found means: use 1:1 mapping. */
4754 PyErr_Clear();
4755 *result = NULL;
4756 return 0;
4757 } else
4758 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004759 }
4760 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004761 *result = x;
4762 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004763 }
4764 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004765 long value = PyInt_AS_LONG(x);
4766 long max = PyUnicode_GetMax();
4767 if (value < 0 || value > max) {
4768 PyErr_Format(PyExc_TypeError,
4769 "character mapping must be in range(0x%lx)", max+1);
4770 Py_DECREF(x);
4771 return -1;
4772 }
4773 *result = x;
4774 return 0;
4775 }
4776 else if (PyUnicode_Check(x)) {
4777 *result = x;
4778 return 0;
4779 }
4780 else {
4781 /* wrong return value */
4782 PyErr_SetString(PyExc_TypeError,
4783 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004784 Py_DECREF(x);
4785 return -1;
4786 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004787}
4788/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004789 if not reallocate and adjust various state variables.
4790 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791static
Walter Dörwald4894c302003-10-24 14:25:28 +00004792int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004793 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004795 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004796 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004797 /* remember old output position */
4798 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4799 /* exponentially overallocate to minimize reallocations */
4800 if (requiredsize < 2 * oldsize)
4801 requiredsize = 2 * oldsize;
4802 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4803 return -1;
4804 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 }
4806 return 0;
4807}
4808/* lookup the character, put the result in the output string and adjust
4809 various state variables. Return a new reference to the object that
4810 was put in the output buffer in *result, or Py_None, if the mapping was
4811 undefined (in which case no character was written).
4812 The called must decref result.
4813 Return 0 on success, -1 on error. */
4814static
Walter Dörwald4894c302003-10-24 14:25:28 +00004815int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004816 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4817 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818{
Walter Dörwald4894c302003-10-24 14:25:28 +00004819 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004820 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004822 /* not found => default to 1:1 mapping */
4823 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 }
4825 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004826 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004827 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004828 /* no overflow check, because we know that the space is enough */
4829 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 }
4831 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004832 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4833 if (repsize==1) {
4834 /* no overflow check, because we know that the space is enough */
4835 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4836 }
4837 else if (repsize!=0) {
4838 /* more than one character */
4839 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4840 (insize - (curinp-startinp)) +
4841 repsize - 1;
4842 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4843 return -1;
4844 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4845 *outp += repsize;
4846 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004847 }
4848 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004849 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 return 0;
4851}
4852
4853PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004854 Py_ssize_t size,
4855 PyObject *mapping,
4856 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 /* output object */
4859 PyObject *res = NULL;
4860 /* pointers to the beginning and end+1 of input */
4861 const Py_UNICODE *startp = p;
4862 const Py_UNICODE *endp = p + size;
4863 /* pointer into the output */
4864 Py_UNICODE *str;
4865 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004866 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004867 char *reason = "character maps to <undefined>";
4868 PyObject *errorHandler = NULL;
4869 PyObject *exc = NULL;
4870 /* the following variable is used for caching string comparisons
4871 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4872 * 3=ignore, 4=xmlcharrefreplace */
4873 int known_errorHandler = -1;
4874
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004876 PyErr_BadArgument();
4877 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879
4880 /* allocate enough for a simple 1:1 translation without
4881 replacements, if we need more, we'll resize */
4882 res = PyUnicode_FromUnicode(NULL, size);
4883 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004884 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004886 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004890 /* try to encode it */
4891 PyObject *x = NULL;
4892 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4893 Py_XDECREF(x);
4894 goto onError;
4895 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004896 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004897 if (x!=Py_None) /* it worked => adjust input pointer */
4898 ++p;
4899 else { /* untranslatable character */
4900 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4901 Py_ssize_t repsize;
4902 Py_ssize_t newpos;
4903 Py_UNICODE *uni2;
4904 /* startpos for collecting untranslatable chars */
4905 const Py_UNICODE *collstart = p;
4906 const Py_UNICODE *collend = p+1;
4907 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004909 /* find all untranslatable characters */
4910 while (collend < endp) {
4911 if (charmaptranslate_lookup(*collend, mapping, &x))
4912 goto onError;
4913 Py_XDECREF(x);
4914 if (x!=Py_None)
4915 break;
4916 ++collend;
4917 }
4918 /* cache callback name lookup
4919 * (if not done yet, i.e. it's the first error) */
4920 if (known_errorHandler==-1) {
4921 if ((errors==NULL) || (!strcmp(errors, "strict")))
4922 known_errorHandler = 1;
4923 else if (!strcmp(errors, "replace"))
4924 known_errorHandler = 2;
4925 else if (!strcmp(errors, "ignore"))
4926 known_errorHandler = 3;
4927 else if (!strcmp(errors, "xmlcharrefreplace"))
4928 known_errorHandler = 4;
4929 else
4930 known_errorHandler = 0;
4931 }
4932 switch (known_errorHandler) {
4933 case 1: /* strict */
4934 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004935 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004936 case 2: /* replace */
4937 /* No need to check for space, this is a 1:1 replacement */
4938 for (coll = collstart; coll<collend; ++coll)
4939 *str++ = '?';
4940 /* fall through */
4941 case 3: /* ignore */
4942 p = collend;
4943 break;
4944 case 4: /* xmlcharrefreplace */
4945 /* generate replacement (temporarily (mis)uses p) */
4946 for (p = collstart; p < collend; ++p) {
4947 char buffer[2+29+1+1];
4948 char *cp;
4949 sprintf(buffer, "&#%d;", (int)*p);
4950 if (charmaptranslate_makespace(&res, &str,
4951 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
4952 goto onError;
4953 for (cp = buffer; *cp; ++cp)
4954 *str++ = *cp;
4955 }
4956 p = collend;
4957 break;
4958 default:
4959 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
4960 reason, startp, size, &exc,
4961 collstart-startp, collend-startp, &newpos);
4962 if (repunicode == NULL)
4963 goto onError;
4964 /* generate replacement */
4965 repsize = PyUnicode_GET_SIZE(repunicode);
4966 if (charmaptranslate_makespace(&res, &str,
4967 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
4968 Py_DECREF(repunicode);
4969 goto onError;
4970 }
4971 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
4972 *str++ = *uni2;
4973 p = startp + newpos;
4974 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004975 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004976 }
4977 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978 /* Resize if we allocated to much */
4979 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00004980 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004981 if (PyUnicode_Resize(&res, respos) < 0)
4982 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004983 }
4984 Py_XDECREF(exc);
4985 Py_XDECREF(errorHandler);
4986 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004988 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004989 Py_XDECREF(res);
4990 Py_XDECREF(exc);
4991 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992 return NULL;
4993}
4994
4995PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004996 PyObject *mapping,
4997 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998{
4999 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005000
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001 str = PyUnicode_FromObject(str);
5002 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005003 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005005 PyUnicode_GET_SIZE(str),
5006 mapping,
5007 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008 Py_DECREF(str);
5009 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005010
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005011 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 Py_XDECREF(str);
5013 return NULL;
5014}
Tim Petersced69f82003-09-16 20:30:58 +00005015
Guido van Rossum9e896b32000-04-05 20:11:21 +00005016/* --- Decimal Encoder ---------------------------------------------------- */
5017
5018int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005019 Py_ssize_t length,
5020 char *output,
5021 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005022{
5023 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005024 PyObject *errorHandler = NULL;
5025 PyObject *exc = NULL;
5026 const char *encoding = "decimal";
5027 const char *reason = "invalid decimal Unicode string";
5028 /* the following variable is used for caching string comparisons
5029 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5030 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005031
5032 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005033 PyErr_BadArgument();
5034 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005035 }
5036
5037 p = s;
5038 end = s + length;
5039 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005040 register Py_UNICODE ch = *p;
5041 int decimal;
5042 PyObject *repunicode;
5043 Py_ssize_t repsize;
5044 Py_ssize_t newpos;
5045 Py_UNICODE *uni2;
5046 Py_UNICODE *collstart;
5047 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005048
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005049 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005050 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005051 ++p;
5052 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005053 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005054 decimal = Py_UNICODE_TODECIMAL(ch);
5055 if (decimal >= 0) {
5056 *output++ = '0' + decimal;
5057 ++p;
5058 continue;
5059 }
5060 if (0 < ch && ch < 256) {
5061 *output++ = (char)ch;
5062 ++p;
5063 continue;
5064 }
5065 /* All other characters are considered unencodable */
5066 collstart = p;
5067 collend = p+1;
5068 while (collend < end) {
5069 if ((0 < *collend && *collend < 256) ||
5070 !Py_UNICODE_ISSPACE(*collend) ||
5071 Py_UNICODE_TODECIMAL(*collend))
5072 break;
5073 }
5074 /* cache callback name lookup
5075 * (if not done yet, i.e. it's the first error) */
5076 if (known_errorHandler==-1) {
5077 if ((errors==NULL) || (!strcmp(errors, "strict")))
5078 known_errorHandler = 1;
5079 else if (!strcmp(errors, "replace"))
5080 known_errorHandler = 2;
5081 else if (!strcmp(errors, "ignore"))
5082 known_errorHandler = 3;
5083 else if (!strcmp(errors, "xmlcharrefreplace"))
5084 known_errorHandler = 4;
5085 else
5086 known_errorHandler = 0;
5087 }
5088 switch (known_errorHandler) {
5089 case 1: /* strict */
5090 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5091 goto onError;
5092 case 2: /* replace */
5093 for (p = collstart; p < collend; ++p)
5094 *output++ = '?';
5095 /* fall through */
5096 case 3: /* ignore */
5097 p = collend;
5098 break;
5099 case 4: /* xmlcharrefreplace */
5100 /* generate replacement (temporarily (mis)uses p) */
5101 for (p = collstart; p < collend; ++p)
5102 output += sprintf(output, "&#%d;", (int)*p);
5103 p = collend;
5104 break;
5105 default:
5106 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5107 encoding, reason, s, length, &exc,
5108 collstart-s, collend-s, &newpos);
5109 if (repunicode == NULL)
5110 goto onError;
5111 /* generate replacement */
5112 repsize = PyUnicode_GET_SIZE(repunicode);
5113 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5114 Py_UNICODE ch = *uni2;
5115 if (Py_UNICODE_ISSPACE(ch))
5116 *output++ = ' ';
5117 else {
5118 decimal = Py_UNICODE_TODECIMAL(ch);
5119 if (decimal >= 0)
5120 *output++ = '0' + decimal;
5121 else if (0 < ch && ch < 256)
5122 *output++ = (char)ch;
5123 else {
5124 Py_DECREF(repunicode);
5125 raise_encode_exception(&exc, encoding,
5126 s, length, collstart-s, collend-s, reason);
5127 goto onError;
5128 }
5129 }
5130 }
5131 p = s + newpos;
5132 Py_DECREF(repunicode);
5133 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005134 }
5135 /* 0-terminate the output string */
5136 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005137 Py_XDECREF(exc);
5138 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005139 return 0;
5140
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005141 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005142 Py_XDECREF(exc);
5143 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005144 return -1;
5145}
5146
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147/* --- Helpers ------------------------------------------------------------ */
5148
Eric Smitha9f7d622008-02-17 19:46:49 +00005149#include "stringlib/unicodedefs.h"
Fredrik Lundh6471ee42006-05-24 14:28:11 +00005150
Facundo Batista6f7e6fb2007-11-16 19:16:15 +00005151#define FROM_UNICODE
Fredrik Lundhb9479482006-05-26 17:22:38 +00005152
Fredrik Lundha50d2012006-05-26 17:04:58 +00005153#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005154
5155#include "stringlib/count.h"
5156#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005157#include "stringlib/partition.h"
5158
/* helper macro to fixup start/end slice values.
   Operates on variables named `start` and `end` that must be in scope at
   the expansion site; `obj` must point to something with a `length` member.
   Negative values are taken relative to the length, and the results are
   clamped so that start >= 0 and 0 <= end <= length.
   Wrapped in do { } while (0) so that the expansion is a single statement
   and stays correct under an unbraced if/else; use it with a trailing
   semicolon. */
#define FIX_START_END(obj)                      \
    do {                                        \
        if (start < 0)                          \
            start += (obj)->length;             \
        if (start < 0)                          \
            start = 0;                          \
        if (end > (obj)->length)                \
            end = (obj)->length;                \
        if (end < 0)                            \
            end += (obj)->length;               \
        if (end < 0)                            \
            end = 0;                            \
    } while (0)
5171
Martin v. Löwis18e16552006-02-15 17:27:45 +00005172Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005173 PyObject *substr,
5174 Py_ssize_t start,
5175 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005177 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005178 PyUnicodeObject* str_obj;
5179 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005180
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005181 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5182 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005183 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005184 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5185 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005186 Py_DECREF(str_obj);
5187 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 }
Tim Petersced69f82003-09-16 20:30:58 +00005189
Fredrik Lundhc8162812006-05-26 19:33:03 +00005190 FIX_START_END(str_obj);
Tim Petersced69f82003-09-16 20:30:58 +00005191
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005192 result = stringlib_count(
5193 str_obj->str + start, end - start, sub_obj->str, sub_obj->length
5194 );
5195
5196 Py_DECREF(sub_obj);
5197 Py_DECREF(str_obj);
5198
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 return result;
5200}
5201
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005203 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005204 Py_ssize_t start,
5205 Py_ssize_t end,
5206 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005208 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005209
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005210 str = PyUnicode_FromObject(str);
5211 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005212 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005213 sub = PyUnicode_FromObject(sub);
5214 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005215 Py_DECREF(str);
5216 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 }
Tim Petersced69f82003-09-16 20:30:58 +00005218
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005219 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005220 result = stringlib_find_slice(
5221 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5222 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5223 start, end
5224 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005225 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005226 result = stringlib_rfind_slice(
5227 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5228 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5229 start, end
5230 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005231
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005232 Py_DECREF(str);
5233 Py_DECREF(sub);
5234
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 return result;
5236}
5237
Tim Petersced69f82003-09-16 20:30:58 +00005238static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005240 PyUnicodeObject *substring,
5241 Py_ssize_t start,
5242 Py_ssize_t end,
5243 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 if (substring->length == 0)
5246 return 1;
5247
Fredrik Lundhc8162812006-05-26 19:33:03 +00005248 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249
5250 end -= substring->length;
5251 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005252 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253
5254 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005255 if (Py_UNICODE_MATCH(self, end, substring))
5256 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 } else {
5258 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005259 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 }
5261
5262 return 0;
5263}
5264
Martin v. Löwis18e16552006-02-15 17:27:45 +00005265Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005266 PyObject *substr,
5267 Py_ssize_t start,
5268 Py_ssize_t end,
5269 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005271 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005272
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 str = PyUnicode_FromObject(str);
5274 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005275 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 substr = PyUnicode_FromObject(substr);
5277 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005278 Py_DECREF(str);
5279 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 }
Tim Petersced69f82003-09-16 20:30:58 +00005281
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005283 (PyUnicodeObject *)substr,
5284 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 Py_DECREF(str);
5286 Py_DECREF(substr);
5287 return result;
5288}
5289
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290/* Apply fixfct filter to the Unicode object self and return a
5291 reference to the modified object */
5292
Tim Petersced69f82003-09-16 20:30:58 +00005293static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005295 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296{
5297
5298 PyUnicodeObject *u;
5299
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005300 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005302 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005303
5304 Py_UNICODE_COPY(u->str, self->str, self->length);
5305
Tim Peters7a29bd52001-09-12 03:03:31 +00005306 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005307 /* fixfct should return TRUE if it modified the buffer. If
5308 FALSE, return a reference to the original buffer instead
5309 (to save space, not time) */
5310 Py_INCREF(self);
5311 Py_DECREF(u);
5312 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 }
5314 return (PyObject*) u;
5315}
5316
Tim Petersced69f82003-09-16 20:30:58 +00005317static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318int fixupper(PyUnicodeObject *self)
5319{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005320 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 Py_UNICODE *s = self->str;
5322 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005323
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005325 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005326
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005327 ch = Py_UNICODE_TOUPPER(*s);
5328 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005329 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005330 *s = ch;
5331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 s++;
5333 }
5334
5335 return status;
5336}
5337
Tim Petersced69f82003-09-16 20:30:58 +00005338static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005339int fixlower(PyUnicodeObject *self)
5340{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005341 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 Py_UNICODE *s = self->str;
5343 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005344
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005346 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005347
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005348 ch = Py_UNICODE_TOLOWER(*s);
5349 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005351 *s = ch;
5352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 s++;
5354 }
5355
5356 return status;
5357}
5358
Tim Petersced69f82003-09-16 20:30:58 +00005359static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360int fixswapcase(PyUnicodeObject *self)
5361{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005362 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 Py_UNICODE *s = self->str;
5364 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005365
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 while (len-- > 0) {
5367 if (Py_UNICODE_ISUPPER(*s)) {
5368 *s = Py_UNICODE_TOLOWER(*s);
5369 status = 1;
5370 } else if (Py_UNICODE_ISLOWER(*s)) {
5371 *s = Py_UNICODE_TOUPPER(*s);
5372 status = 1;
5373 }
5374 s++;
5375 }
5376
5377 return status;
5378}
5379
Tim Petersced69f82003-09-16 20:30:58 +00005380static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381int fixcapitalize(PyUnicodeObject *self)
5382{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005383 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005384 Py_UNICODE *s = self->str;
5385 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005386
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005387 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005388 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005389 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005390 *s = Py_UNICODE_TOUPPER(*s);
5391 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005393 s++;
5394 while (--len > 0) {
5395 if (Py_UNICODE_ISUPPER(*s)) {
5396 *s = Py_UNICODE_TOLOWER(*s);
5397 status = 1;
5398 }
5399 s++;
5400 }
5401 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402}
5403
5404static
5405int fixtitle(PyUnicodeObject *self)
5406{
5407 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5408 register Py_UNICODE *e;
5409 int previous_is_cased;
5410
5411 /* Shortcut for single character strings */
5412 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005413 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5414 if (*p != ch) {
5415 *p = ch;
5416 return 1;
5417 }
5418 else
5419 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 }
Tim Petersced69f82003-09-16 20:30:58 +00005421
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 e = p + PyUnicode_GET_SIZE(self);
5423 previous_is_cased = 0;
5424 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005425 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005426
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005427 if (previous_is_cased)
5428 *p = Py_UNICODE_TOLOWER(ch);
5429 else
5430 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005431
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005432 if (Py_UNICODE_ISLOWER(ch) ||
5433 Py_UNICODE_ISUPPER(ch) ||
5434 Py_UNICODE_ISTITLE(ch))
5435 previous_is_cased = 1;
5436 else
5437 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 }
5439 return 1;
5440}
5441
Tim Peters8ce9f162004-08-27 01:49:32 +00005442PyObject *
5443PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444{
Tim Peters8ce9f162004-08-27 01:49:32 +00005445 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005446 const Py_UNICODE blank = ' ';
5447 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005448 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005449 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005450 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5451 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005452 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5453 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005454 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005455 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005456 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457
Tim Peters05eba1f2004-08-27 21:32:02 +00005458 fseq = PySequence_Fast(seq, "");
5459 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005460 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005461 }
5462
Tim Peters91879ab2004-08-27 22:35:44 +00005463 /* Grrrr. A codec may be invoked to convert str objects to
5464 * Unicode, and so it's possible to call back into Python code
5465 * during PyUnicode_FromObject(), and so it's possible for a sick
5466 * codec to change the size of fseq (if seq is a list). Therefore
5467 * we have to keep refetching the size -- can't assume seqlen
5468 * is invariant.
5469 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005470 seqlen = PySequence_Fast_GET_SIZE(fseq);
5471 /* If empty sequence, return u"". */
5472 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005473 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5474 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005475 }
5476 /* If singleton sequence with an exact Unicode, return that. */
5477 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005478 item = PySequence_Fast_GET_ITEM(fseq, 0);
5479 if (PyUnicode_CheckExact(item)) {
5480 Py_INCREF(item);
5481 res = (PyUnicodeObject *)item;
5482 goto Done;
5483 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005484 }
5485
Tim Peters05eba1f2004-08-27 21:32:02 +00005486 /* At least two items to join, or one that isn't exact Unicode. */
5487 if (seqlen > 1) {
5488 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005489 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005490 sep = &blank;
5491 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005492 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005493 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005494 internal_separator = PyUnicode_FromObject(separator);
5495 if (internal_separator == NULL)
5496 goto onError;
5497 sep = PyUnicode_AS_UNICODE(internal_separator);
5498 seplen = PyUnicode_GET_SIZE(internal_separator);
5499 /* In case PyUnicode_FromObject() mutated seq. */
5500 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005501 }
5502 }
5503
5504 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005505 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005506 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005507 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005508 res_p = PyUnicode_AS_UNICODE(res);
5509 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005510
Tim Peters05eba1f2004-08-27 21:32:02 +00005511 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005512 Py_ssize_t itemlen;
5513 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005514
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005515 item = PySequence_Fast_GET_ITEM(fseq, i);
5516 /* Convert item to Unicode. */
5517 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5518 PyErr_Format(PyExc_TypeError,
5519 "sequence item %zd: expected string or Unicode,"
5520 " %.80s found",
5521 i, Py_TYPE(item)->tp_name);
5522 goto onError;
5523 }
5524 item = PyUnicode_FromObject(item);
5525 if (item == NULL)
5526 goto onError;
5527 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005528
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005529 /* In case PyUnicode_FromObject() mutated seq. */
5530 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005531
Tim Peters8ce9f162004-08-27 01:49:32 +00005532 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005533 itemlen = PyUnicode_GET_SIZE(item);
5534 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005535 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005536 goto Overflow;
5537 if (i < seqlen - 1) {
5538 new_res_used += seplen;
5539 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005540 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005541 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005542 if (new_res_used > res_alloc) {
5543 /* double allocated size until it's big enough */
5544 do {
5545 res_alloc += res_alloc;
5546 if (res_alloc <= 0)
5547 goto Overflow;
5548 } while (new_res_used > res_alloc);
5549 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5550 Py_DECREF(item);
5551 goto onError;
5552 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005553 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005554 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005555
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005556 /* Copy item, and maybe the separator. */
5557 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5558 res_p += itemlen;
5559 if (i < seqlen - 1) {
5560 Py_UNICODE_COPY(res_p, sep, seplen);
5561 res_p += seplen;
5562 }
5563 Py_DECREF(item);
5564 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005565 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005566
Tim Peters05eba1f2004-08-27 21:32:02 +00005567 /* Shrink res to match the used area; this probably can't fail,
5568 * but it's cheap to check.
5569 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005570 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005571 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005572
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005573 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005574 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005575 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 return (PyObject *)res;
5577
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005578 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005579 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005580 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005581 Py_DECREF(item);
5582 /* fall through */
5583
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005584 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005585 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005586 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005587 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 return NULL;
5589}
5590
Tim Petersced69f82003-09-16 20:30:58 +00005591static
5592PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005593 Py_ssize_t left,
5594 Py_ssize_t right,
5595 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596{
5597 PyUnicodeObject *u;
5598
5599 if (left < 0)
5600 left = 0;
5601 if (right < 0)
5602 right = 0;
5603
Tim Peters7a29bd52001-09-12 03:03:31 +00005604 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 Py_INCREF(self);
5606 return self;
5607 }
5608
Neal Norwitze7d8be82008-07-31 17:17:14 +00005609 if (left > PY_SSIZE_T_MAX - self->length ||
5610 right > PY_SSIZE_T_MAX - (left + self->length)) {
5611 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5612 return NULL;
5613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 u = _PyUnicode_New(left + self->length + right);
5615 if (u) {
5616 if (left)
5617 Py_UNICODE_FILL(u->str, fill, left);
5618 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5619 if (right)
5620 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5621 }
5622
5623 return u;
5624}
5625
/* Append data[left:right] as a new unicode object to `list`.

   Relies on the expanding function providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label; control jumps to
   `onError` if the substring cannot be created or the append fails.
   The temporary reference held in `str` is always released.

   Wrapped in do { } while (0) so the macro behaves as one statement
   (safe under an unbraced if/else); it must be followed by a ';'. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_failed_;                                       \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_failed_ = PyList_Append(list, str);                \
        Py_DECREF(str);                                                 \
        if (split_append_failed_)                                       \
            goto onError;                                               \
    } while (0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636
5637static
5638PyObject *split_whitespace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005639 PyObject *list,
5640 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005642 register Py_ssize_t i;
5643 register Py_ssize_t j;
5644 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005646 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647
5648 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005649 /* find a token */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005650 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005651 i++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005652 j = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005653 while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
5654 i++;
5655 if (j < i) {
5656 if (maxcount-- <= 0)
5657 break;
5658 SPLIT_APPEND(buf, j, i);
5659 while (i < len && Py_UNICODE_ISSPACE(buf[i]))
5660 i++;
5661 j = i;
5662 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 }
5664 if (j < len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005665 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 }
5667 return list;
5668
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005669 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 Py_DECREF(list);
5671 return NULL;
5672}
5673
5674PyObject *PyUnicode_Splitlines(PyObject *string,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005675 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005677 register Py_ssize_t i;
5678 register Py_ssize_t j;
5679 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 PyObject *list;
5681 PyObject *str;
5682 Py_UNICODE *data;
5683
5684 string = PyUnicode_FromObject(string);
5685 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005686 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 data = PyUnicode_AS_UNICODE(string);
5688 len = PyUnicode_GET_SIZE(string);
5689
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 list = PyList_New(0);
5691 if (!list)
5692 goto onError;
5693
5694 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005695 Py_ssize_t eol;
Tim Petersced69f82003-09-16 20:30:58 +00005696
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005697 /* Find a line and append it */
5698 while (i < len && !BLOOM_LINEBREAK(data[i]))
5699 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005701 /* Skip the line break reading CRLF as one line break */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005702 eol = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005703 if (i < len) {
5704 if (data[i] == '\r' && i + 1 < len &&
5705 data[i+1] == '\n')
5706 i += 2;
5707 else
5708 i++;
5709 if (keepends)
5710 eol = i;
5711 }
5712 SPLIT_APPEND(data, j, eol);
5713 j = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 }
5715 if (j < len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005716 SPLIT_APPEND(data, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 }
5718
5719 Py_DECREF(string);
5720 return list;
5721
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005722 onError:
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005723 Py_XDECREF(list);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 Py_DECREF(string);
5725 return NULL;
5726}
5727
Tim Petersced69f82003-09-16 20:30:58 +00005728static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729PyObject *split_char(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005730 PyObject *list,
5731 Py_UNICODE ch,
5732 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005734 register Py_ssize_t i;
5735 register Py_ssize_t j;
5736 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005738 register const Py_UNICODE *buf = self->str;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
5740 for (i = j = 0; i < len; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005741 if (buf[i] == ch) {
5742 if (maxcount-- <= 0)
5743 break;
5744 SPLIT_APPEND(buf, j, i);
5745 i = j = i + 1;
5746 } else
5747 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 }
5749 if (j <= len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005750 SPLIT_APPEND(buf, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 }
5752 return list;
5753
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005754 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 Py_DECREF(list);
5756 return NULL;
5757}
5758
Tim Petersced69f82003-09-16 20:30:58 +00005759static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760PyObject *split_substring(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005761 PyObject *list,
5762 PyUnicodeObject *substring,
5763 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005765 register Py_ssize_t i;
5766 register Py_ssize_t j;
5767 Py_ssize_t len = self->length;
5768 Py_ssize_t sublen = substring->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 PyObject *str;
5770
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00005771 for (i = j = 0; i <= len - sublen; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005772 if (Py_UNICODE_MATCH(self, i, substring)) {
5773 if (maxcount-- <= 0)
5774 break;
5775 SPLIT_APPEND(self->str, j, i);
5776 i = j = i + sublen;
5777 } else
5778 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779 }
5780 if (j <= len) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005781 SPLIT_APPEND(self->str, j, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782 }
5783 return list;
5784
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005785 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786 Py_DECREF(list);
5787 return NULL;
5788}
5789
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005790static
5791PyObject *rsplit_whitespace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005792 PyObject *list,
5793 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005794{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005795 register Py_ssize_t i;
5796 register Py_ssize_t j;
5797 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005798 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005799 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005800
5801 for (i = j = len - 1; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005802 /* find a token */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005803 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005804 i--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005805 j = i;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005806 while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
5807 i--;
5808 if (j > i) {
5809 if (maxcount-- <= 0)
5810 break;
5811 SPLIT_APPEND(buf, i + 1, j + 1);
5812 while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
5813 i--;
5814 j = i;
5815 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005816 }
5817 if (j >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005818 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005819 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005820 if (PyList_Reverse(list) < 0)
5821 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005822 return list;
5823
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005824 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005825 Py_DECREF(list);
5826 return NULL;
5827}
5828
Benjamin Peterson857ce152009-01-31 16:29:18 +00005829static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005830PyObject *rsplit_char(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005831 PyObject *list,
5832 Py_UNICODE ch,
5833 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005834{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005835 register Py_ssize_t i;
5836 register Py_ssize_t j;
5837 Py_ssize_t len = self->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005838 PyObject *str;
Christian Heimes4d4f2702008-01-30 11:32:37 +00005839 register const Py_UNICODE *buf = self->str;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005840
5841 for (i = j = len - 1; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005842 if (buf[i] == ch) {
5843 if (maxcount-- <= 0)
5844 break;
5845 SPLIT_APPEND(buf, i + 1, j + 1);
5846 j = i = i - 1;
5847 } else
5848 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005849 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00005850 if (j >= -1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005851 SPLIT_APPEND(buf, 0, j + 1);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005852 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005853 if (PyList_Reverse(list) < 0)
5854 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005855 return list;
5856
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005857 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005858 Py_DECREF(list);
5859 return NULL;
5860}
5861
Benjamin Peterson857ce152009-01-31 16:29:18 +00005862static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005863PyObject *rsplit_substring(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005864 PyObject *list,
5865 PyUnicodeObject *substring,
5866 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005867{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 register Py_ssize_t i;
5869 register Py_ssize_t j;
5870 Py_ssize_t len = self->length;
5871 Py_ssize_t sublen = substring->length;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005872 PyObject *str;
5873
5874 for (i = len - sublen, j = len; i >= 0; ) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005875 if (Py_UNICODE_MATCH(self, i, substring)) {
5876 if (maxcount-- <= 0)
5877 break;
5878 SPLIT_APPEND(self->str, i + sublen, j);
5879 j = i;
5880 i -= sublen;
5881 } else
5882 i--;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005883 }
5884 if (j >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005885 SPLIT_APPEND(self->str, 0, j);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005886 }
Fredrik Lundhb63588c2006-05-23 18:44:25 +00005887 if (PyList_Reverse(list) < 0)
5888 goto onError;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005889 return list;
5890
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005891 onError:
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005892 Py_DECREF(list);
5893 return NULL;
5894}
5895
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896#undef SPLIT_APPEND
5897
5898static
5899PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005900 PyUnicodeObject *substring,
5901 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902{
5903 PyObject *list;
5904
5905 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005906 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907
5908 list = PyList_New(0);
5909 if (!list)
5910 return NULL;
5911
5912 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005913 return split_whitespace(self,list,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
5915 else if (substring->length == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005916 return split_char(self,list,substring->str[0],maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917
5918 else if (substring->length == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005919 Py_DECREF(list);
5920 PyErr_SetString(PyExc_ValueError, "empty separator");
5921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 }
5923 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005924 return split_substring(self,list,substring,maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925}
5926
Tim Petersced69f82003-09-16 20:30:58 +00005927static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005928PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005929 PyUnicodeObject *substring,
5930 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005931{
5932 PyObject *list;
5933
5934 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005935 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005936
5937 list = PyList_New(0);
5938 if (!list)
5939 return NULL;
5940
5941 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005942 return rsplit_whitespace(self,list,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005943
5944 else if (substring->length == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005945 return rsplit_char(self,list,substring->str[0],maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005946
5947 else if (substring->length == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005948 Py_DECREF(list);
5949 PyErr_SetString(PyExc_ValueError, "empty separator");
5950 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005951 }
5952 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005953 return rsplit_substring(self,list,substring,maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005954}
5955
5956static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005958 PyUnicodeObject *str1,
5959 PyUnicodeObject *str2,
5960 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961{
5962 PyUnicodeObject *u;
5963
5964 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005965 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
Fredrik Lundh347ee272006-05-24 16:35:18 +00005967 if (str1->length == str2->length) {
5968 /* same length */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005969 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005970 if (str1->length == 1) {
5971 /* replace characters */
5972 Py_UNICODE u1, u2;
5973 if (!findchar(self->str, self->length, str1->str[0]))
5974 goto nothing;
5975 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5976 if (!u)
5977 return NULL;
5978 Py_UNICODE_COPY(u->str, self->str, self->length);
5979 u1 = str1->str[0];
5980 u2 = str2->str[0];
5981 for (i = 0; i < u->length; i++)
5982 if (u->str[i] == u1) {
5983 if (--maxcount < 0)
5984 break;
5985 u->str[i] = u2;
5986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005988 i = fastsearch(
5989 self->str, self->length, str1->str, str1->length, FAST_SEARCH
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005991 if (i < 0)
5992 goto nothing;
5993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5994 if (!u)
5995 return NULL;
5996 Py_UNICODE_COPY(u->str, self->str, self->length);
5997 while (i <= self->length - str1->length)
5998 if (Py_UNICODE_MATCH(self, i, str1)) {
5999 if (--maxcount < 0)
6000 break;
6001 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6002 i += str1->length;
6003 } else
6004 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006007
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006008 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006009 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 Py_UNICODE *p;
6011
6012 /* replace strings */
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006013 n = stringlib_count(self->str, self->length, str1->str, str1->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 if (n > maxcount)
6015 n = maxcount;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006016 if (n == 0)
6017 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00006018 /* new_size = self->length + n * (str2->length - str1->length)); */
6019 delta = (str2->length - str1->length);
6020 if (delta == 0) {
6021 new_size = self->length;
6022 } else {
6023 product = n * (str2->length - str1->length);
6024 if ((product / (str2->length - str1->length)) != n) {
6025 PyErr_SetString(PyExc_OverflowError,
6026 "replace string is too long");
6027 return NULL;
6028 }
6029 new_size = self->length + product;
6030 if (new_size < 0) {
6031 PyErr_SetString(PyExc_OverflowError,
6032 "replace string is too long");
6033 return NULL;
6034 }
6035 }
6036 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006037 if (!u)
6038 return NULL;
6039 i = 0;
6040 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006041 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006042 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006043 while (n-- > 0) {
6044 /* look for next match */
6045 j = i;
6046 while (j <= e) {
6047 if (Py_UNICODE_MATCH(self, j, str1))
6048 break;
6049 j++;
6050 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006051 if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006052 if (j > e)
6053 break;
6054 /* copy unchanged part [i:j] */
6055 Py_UNICODE_COPY(p, self->str+i, j-i);
6056 p += j - i;
6057 }
6058 /* copy substitution string */
6059 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00006060 Py_UNICODE_COPY(p, str2->str, str2->length);
6061 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006062 }
6063 i = j + str1->length;
6064 }
6065 if (i < self->length)
6066 /* copy tail [i:] */
6067 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00006068 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00006069 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00006070 while (n > 0) {
6071 Py_UNICODE_COPY(p, str2->str, str2->length);
6072 p += str2->length;
6073 if (--n <= 0)
6074 break;
6075 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00006077 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 }
6079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00006081
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006082 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00006083 /* nothing to replace; return original string (when possible) */
6084 if (PyUnicode_CheckExact(self)) {
6085 Py_INCREF(self);
6086 return (PyObject *) self;
6087 }
6088 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089}
6090
6091/* --- Unicode Object Methods --------------------------------------------- */
6092
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006093PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006094 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095\n\
6096Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006097characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098
6099static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006100unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 return fixup(self, fixtitle);
6103}
6104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006105PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006106 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107\n\
6108Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006109have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
6111static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006112unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 return fixup(self, fixcapitalize);
6115}
6116
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits self on whitespace, replaces every word in the list by its
   fixup(..., fixcapitalize) result and joins the list again with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *word = PyList_GET_ITEM(words, pos);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
6154
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006155/* Argument converter. Coerces to a single unicode character */
6156
6157static int
6158convert_uc(PyObject *obj, void *addr)
6159{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006160 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6161 PyObject *uniobj;
6162 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006163
Benjamin Peterson857ce152009-01-31 16:29:18 +00006164 uniobj = PyUnicode_FromObject(obj);
6165 if (uniobj == NULL) {
6166 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006167 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006168 return 0;
6169 }
6170 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6171 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006172 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006173 Py_DECREF(uniobj);
6174 return 0;
6175 }
6176 unistr = PyUnicode_AS_UNICODE(uniobj);
6177 *fillcharloc = unistr[0];
6178 Py_DECREF(uniobj);
6179 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006180}
6181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006182PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006183 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006185Return S centered in a Unicode string of length width. Padding is\n\
6186done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
6188static PyObject *
6189unicode_center(PyUnicodeObject *self, PyObject *args)
6190{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006191 Py_ssize_t marg, left;
6192 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006193 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194
Thomas Woutersde017742006-02-16 19:34:37 +00006195 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196 return NULL;
6197
Tim Peters7a29bd52001-09-12 03:03:31 +00006198 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 Py_INCREF(self);
6200 return (PyObject*) self;
6201 }
6202
6203 marg = width - self->length;
6204 left = marg / 2 + (marg & width & 1);
6205
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006206 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207}
6208
Marc-André Lemburge5034372000-08-08 08:04:29 +00006209#if 0
6210
6211/* This code should go into some future Unicode collation support
6212 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00006213 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006214
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006215/* speedy UTF-16 code point order comparison */
6216/* gleaned from: */
6217/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6218
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006219static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006220{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006221 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006222 0, 0, 0, 0, 0, 0, 0, 0,
6223 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006224 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006225};
6226
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227static int
6228unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6229{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006230 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006231
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 Py_UNICODE *s1 = str1->str;
6233 Py_UNICODE *s2 = str2->str;
6234
6235 len1 = str1->length;
6236 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006237
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006239 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006240
6241 c1 = *s1++;
6242 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006243
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006244 if (c1 > (1<<11) * 26)
6245 c1 += utf16Fixup[c1>>11];
6246 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006247 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006248 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006249
6250 if (c1 != c2)
6251 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006252
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006253 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 }
6255
6256 return (len1 < len2) ? -1 : (len1 != len2);
6257}
6258
Marc-André Lemburge5034372000-08-08 08:04:29 +00006259#else
6260
6261static int
6262unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6263{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006264 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006265
6266 Py_UNICODE *s1 = str1->str;
6267 Py_UNICODE *s2 = str2->str;
6268
6269 len1 = str1->length;
6270 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006271
Marc-André Lemburge5034372000-08-08 08:04:29 +00006272 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006273 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006274
Fredrik Lundh45714e92001-06-26 16:39:36 +00006275 c1 = *s1++;
6276 c2 = *s2++;
6277
6278 if (c1 != c2)
6279 return (c1 < c2) ? -1 : 1;
6280
Marc-André Lemburge5034372000-08-08 08:04:29 +00006281 len1--; len2--;
6282 }
6283
6284 return (len1 < len2) ? -1 : (len1 != len2);
6285}
6286
6287#endif
6288
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006290 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291{
6292 PyUnicodeObject *u = NULL, *v = NULL;
6293 int result;
6294
6295 /* Coerce the two arguments */
6296 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6297 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006298 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6300 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006301 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302
Thomas Wouters7e474022000-07-16 12:04:32 +00006303 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006305 Py_DECREF(u);
6306 Py_DECREF(v);
6307 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 }
6309
6310 result = unicode_compare(u, v);
6311
6312 Py_DECREF(u);
6313 Py_DECREF(v);
6314 return result;
6315
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006316 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 Py_XDECREF(u);
6318 Py_XDECREF(v);
6319 return -1;
6320}
6321
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006322PyObject *PyUnicode_RichCompare(PyObject *left,
6323 PyObject *right,
6324 int op)
6325{
6326 int result;
6327
6328 result = PyUnicode_Compare(left, right);
6329 if (result == -1 && PyErr_Occurred())
6330 goto onError;
6331
6332 /* Convert the return value to a Boolean */
6333 switch (op) {
6334 case Py_EQ:
6335 result = (result == 0);
6336 break;
6337 case Py_NE:
6338 result = (result != 0);
6339 break;
6340 case Py_LE:
6341 result = (result <= 0);
6342 break;
6343 case Py_GE:
6344 result = (result >= 0);
6345 break;
6346 case Py_LT:
6347 result = (result == -1);
6348 break;
6349 case Py_GT:
6350 result = (result == 1);
6351 break;
6352 }
6353 return PyBool_FromLong(result);
6354
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006355 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006356
6357 /* Standard case
6358
6359 Type errors mean that PyUnicode_FromObject() could not convert
6360 one of the arguments (usually the right hand side) to Unicode,
6361 ie. we can't handle the comparison request. However, it is
6362 possible that the other object knows a comparison method, which
6363 is why we return Py_NotImplemented to give the other object a
6364 chance.
6365
6366 */
6367 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6368 PyErr_Clear();
6369 Py_INCREF(Py_NotImplemented);
6370 return Py_NotImplemented;
6371 }
6372 if (op != Py_EQ && op != Py_NE)
6373 return NULL;
6374
6375 /* Equality comparison.
6376
6377 This is a special case: we silence any PyExc_UnicodeDecodeError
6378 and instead turn it into a PyErr_UnicodeWarning.
6379
6380 */
6381 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6382 return NULL;
6383 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006384 if (PyErr_Warn(PyExc_UnicodeWarning,
6385 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006386 "Unicode equal comparison "
6387 "failed to convert both arguments to Unicode - "
6388 "interpreting them as being unequal" :
6389 "Unicode unequal comparison "
6390 "failed to convert both arguments to Unicode - "
6391 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006392 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006393 return NULL;
6394 result = (op == Py_NE);
6395 return PyBool_FromLong(result);
6396}
6397
Guido van Rossum403d68b2000-03-13 15:55:09 +00006398int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006399 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006400{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006401 PyObject *str, *sub;
6402 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006403
6404 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006405 sub = PyUnicode_FromObject(element);
6406 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006407 PyErr_SetString(PyExc_TypeError,
6408 "'in <string>' requires string as left operand");
Fredrik Lundh833bf942006-05-23 10:12:21 +00006409 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006410 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006411
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006412 str = PyUnicode_FromObject(container);
6413 if (!str) {
6414 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006415 return -1;
6416 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006417
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006418 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006419
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006420 Py_DECREF(str);
6421 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006422
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006423 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006424}
6425
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426/* Concat to string or Unicode object giving a new Unicode object. */
6427
6428PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006429 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430{
6431 PyUnicodeObject *u = NULL, *v = NULL, *w;
6432
6433 /* Coerce the two arguments */
6434 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6435 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006436 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6438 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006439 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440
6441 /* Shortcuts */
6442 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006443 Py_DECREF(v);
6444 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 }
6446 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006447 Py_DECREF(u);
6448 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 }
6450
6451 /* Concat the two Unicode strings */
6452 w = _PyUnicode_New(u->length + v->length);
6453 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006454 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 Py_UNICODE_COPY(w->str, u->str, u->length);
6456 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6457
6458 Py_DECREF(u);
6459 Py_DECREF(v);
6460 return (PyObject *)w;
6461
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006462 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 Py_XDECREF(u);
6464 Py_XDECREF(v);
6465 return NULL;
6466}
6467
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006468PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006469 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006471Return the number of non-overlapping occurrences of substring sub in\n\
6472Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006473interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474
6475static PyObject *
6476unicode_count(PyUnicodeObject *self, PyObject *args)
6477{
6478 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006479 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006480 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 PyObject *result;
6482
Guido van Rossumb8872e62000-05-09 14:14:27 +00006483 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006484 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 return NULL;
6486
6487 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006488 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006490 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006491
Fredrik Lundhc8162812006-05-26 19:33:03 +00006492 FIX_START_END(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006494 result = PyInt_FromSsize_t(
6495 stringlib_count(self->str + start, end - start,
6496 substring->str, substring->length)
6497 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498
6499 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006500
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 return result;
6502}
6503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006504PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006505 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006507Encodes S using the codec registered for encoding. encoding defaults\n\
6508to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006509handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006510a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6511'xmlcharrefreplace' as well as any other name registered with\n\
6512codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
6514static PyObject *
6515unicode_encode(PyUnicodeObject *self, PyObject *args)
6516{
6517 char *encoding = NULL;
6518 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006519 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006520
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
6522 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006523 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006524 if (v == NULL)
6525 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006526 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006527 PyErr_Format(PyExc_TypeError,
6528 "encoder did not return a string/unicode object "
6529 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006530 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006531 Py_DECREF(v);
6532 return NULL;
6533 }
6534 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006535
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006536 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006537 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006538}
6539
6540PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006541 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006542\n\
6543Decodes S using the codec registered for encoding. encoding defaults\n\
6544to the default encoding. errors may be given to set a different error\n\
6545handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6546a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6547as well as any other name registerd with codecs.register_error that is\n\
6548able to handle UnicodeDecodeErrors.");
6549
6550static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00006551unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006552{
6553 char *encoding = NULL;
6554 char *errors = NULL;
6555 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006556
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006557 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
6558 return NULL;
6559 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006560 if (v == NULL)
6561 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006562 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006563 PyErr_Format(PyExc_TypeError,
6564 "decoder did not return a string/unicode object "
6565 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006566 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006567 Py_DECREF(v);
6568 return NULL;
6569 }
6570 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006571
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006572 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574}
6575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006576PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006577 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578\n\
6579Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006580If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581
6582static PyObject*
6583unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6584{
6585 Py_UNICODE *e;
6586 Py_UNICODE *p;
6587 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006588 Py_UNICODE *qe;
6589 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 PyUnicodeObject *u;
6591 int tabsize = 8;
6592
6593 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006594 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
Thomas Wouters7e474022000-07-16 12:04:32 +00006596 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006597 i = 0; /* chars up to and including most recent \n or \r */
6598 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6599 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 for (p = self->str; p < e; p++)
6601 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006602 if (tabsize > 0) {
6603 incr = tabsize - (j % tabsize); /* cannot overflow */
6604 if (j > PY_SSIZE_T_MAX - incr)
6605 goto overflow1;
6606 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006607 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006610 if (j > PY_SSIZE_T_MAX - 1)
6611 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 j++;
6613 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006614 if (i > PY_SSIZE_T_MAX - j)
6615 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006617 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 }
6619 }
6620
Guido van Rossum5bdff602008-03-11 21:18:06 +00006621 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006622 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006623
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 /* Second pass: create output string and fill it */
6625 u = _PyUnicode_New(i + j);
6626 if (!u)
6627 return NULL;
6628
Guido van Rossum5bdff602008-03-11 21:18:06 +00006629 j = 0; /* same as in first pass */
6630 q = u->str; /* next output char */
6631 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632
6633 for (p = self->str; p < e; p++)
6634 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006635 if (tabsize > 0) {
6636 i = tabsize - (j % tabsize);
6637 j += i;
6638 while (i--) {
6639 if (q >= qe)
6640 goto overflow2;
6641 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006642 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006643 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006644 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006645 else {
6646 if (q >= qe)
6647 goto overflow2;
6648 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006649 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 if (*p == '\n' || *p == '\r')
6651 j = 0;
6652 }
6653
6654 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006655
6656 overflow2:
6657 Py_DECREF(u);
6658 overflow1:
6659 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661}
6662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006664 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665\n\
6666Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006667such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668arguments start and end are interpreted as in slice notation.\n\
6669\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006670Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672static PyObject *
6673unicode_find(PyUnicodeObject *self, PyObject *args)
6674{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006675 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006676 Py_ssize_t start;
6677 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006678 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679
Facundo Batista57d56692007-11-16 18:04:14 +00006680 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006683 result = stringlib_find_slice(
6684 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6685 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6686 start, end
6687 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688
6689 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006690
6691 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692}
6693
6694static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006695unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696{
6697 if (index < 0 || index >= self->length) {
6698 PyErr_SetString(PyExc_IndexError, "string index out of range");
6699 return NULL;
6700 }
6701
6702 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6703}
6704
6705static long
6706unicode_hash(PyUnicodeObject *self)
6707{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006708 /* Since Unicode objects compare equal to their ASCII string
6709 counterparts, they should use the individual character values
6710 as basis for their hash value. This is needed to assure that
6711 strings and Unicode objects behave in the same way as
6712 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713
Martin v. Löwis18e16552006-02-15 17:27:45 +00006714 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006715 register Py_UNICODE *p;
6716 register long x;
6717
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006719 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006720 len = PyUnicode_GET_SIZE(self);
6721 p = PyUnicode_AS_UNICODE(self);
6722 x = *p << 7;
6723 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006724 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006725 x ^= PyUnicode_GET_SIZE(self);
6726 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006727 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006728 self->hash = x;
6729 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730}
6731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006732PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006733 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006735Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736
6737static PyObject *
6738unicode_index(PyUnicodeObject *self, PyObject *args)
6739{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006740 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006741 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006742 Py_ssize_t start;
6743 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744
Facundo Batista57d56692007-11-16 18:04:14 +00006745 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006748 result = stringlib_find_slice(
6749 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6750 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6751 start, end
6752 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
6754 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006755
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 if (result < 0) {
6757 PyErr_SetString(PyExc_ValueError, "substring not found");
6758 return NULL;
6759 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006760
Martin v. Löwis18e16552006-02-15 17:27:45 +00006761 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762}
6763
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006764PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006765 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006768at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769
6770static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006771unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772{
6773 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6774 register const Py_UNICODE *e;
6775 int cased;
6776
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 /* Shortcut for single character strings */
6778 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006779 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006781 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006782 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006783 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006784
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 e = p + PyUnicode_GET_SIZE(self);
6786 cased = 0;
6787 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006788 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006789
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006790 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6791 return PyBool_FromLong(0);
6792 else if (!cased && Py_UNICODE_ISLOWER(ch))
6793 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006795 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796}
6797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006798PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006799 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006801Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006802at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803
6804static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006805unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806{
6807 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6808 register const Py_UNICODE *e;
6809 int cased;
6810
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 /* Shortcut for single character strings */
6812 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006813 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006815 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006816 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006817 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006818
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 e = p + PyUnicode_GET_SIZE(self);
6820 cased = 0;
6821 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006822 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006823
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006824 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6825 return PyBool_FromLong(0);
6826 else if (!cased && Py_UNICODE_ISUPPER(ch))
6827 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006829 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830}
6831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006832PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006833 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006835Return True if S is a titlecased string and there is at least one\n\
6836character in S, i.e. upper- and titlecase characters may only\n\
6837follow uncased characters and lowercase characters only cased ones.\n\
6838Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839
6840static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006841unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842{
6843 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6844 register const Py_UNICODE *e;
6845 int cased, previous_is_cased;
6846
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 /* Shortcut for single character strings */
6848 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006849 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6850 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006852 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006853 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006854 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006855
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 e = p + PyUnicode_GET_SIZE(self);
6857 cased = 0;
6858 previous_is_cased = 0;
6859 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006860 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006861
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006862 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6863 if (previous_is_cased)
6864 return PyBool_FromLong(0);
6865 previous_is_cased = 1;
6866 cased = 1;
6867 }
6868 else if (Py_UNICODE_ISLOWER(ch)) {
6869 if (!previous_is_cased)
6870 return PyBool_FromLong(0);
6871 previous_is_cased = 1;
6872 cased = 1;
6873 }
6874 else
6875 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006877 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878}
6879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006880PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006881 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006883Return True if all characters in S are whitespace\n\
6884and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885
6886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006887unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888{
6889 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6890 register const Py_UNICODE *e;
6891
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 /* Shortcut for single character strings */
6893 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006894 Py_UNICODE_ISSPACE(*p))
6895 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006897 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006898 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006899 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006900
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 e = p + PyUnicode_GET_SIZE(self);
6902 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006903 if (!Py_UNICODE_ISSPACE(*p))
6904 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006906 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907}
6908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006909PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006910 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006911\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006912Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006913and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006914
6915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006916unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006917{
6918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6919 register const Py_UNICODE *e;
6920
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006921 /* Shortcut for single character strings */
6922 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006923 Py_UNICODE_ISALPHA(*p))
6924 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006925
6926 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006927 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006928 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006929
6930 e = p + PyUnicode_GET_SIZE(self);
6931 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006932 if (!Py_UNICODE_ISALPHA(*p))
6933 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006934 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006935 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006936}
6937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006938PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006939 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006940\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006941Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006942and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006943
6944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006945unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006946{
6947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6948 register const Py_UNICODE *e;
6949
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006950 /* Shortcut for single character strings */
6951 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006952 Py_UNICODE_ISALNUM(*p))
6953 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006954
6955 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006956 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006957 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006958
6959 e = p + PyUnicode_GET_SIZE(self);
6960 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006961 if (!Py_UNICODE_ISALNUM(*p))
6962 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006964 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006965}
6966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006967PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006968 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006969\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006970Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006971False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972
6973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006974unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975{
6976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6977 register const Py_UNICODE *e;
6978
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 /* Shortcut for single character strings */
6980 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006981 Py_UNICODE_ISDECIMAL(*p))
6982 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006984 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006985 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006986 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006987
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988 e = p + PyUnicode_GET_SIZE(self);
6989 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006990 if (!Py_UNICODE_ISDECIMAL(*p))
6991 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006993 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006994}
6995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006996PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006997 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006999Return True if all characters in S are digits\n\
7000and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007001
7002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007003unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004{
7005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7006 register const Py_UNICODE *e;
7007
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 /* Shortcut for single character strings */
7009 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007010 Py_UNICODE_ISDIGIT(*p))
7011 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007013 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007014 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007015 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007016
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 e = p + PyUnicode_GET_SIZE(self);
7018 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007019 if (!Py_UNICODE_ISDIGIT(*p))
7020 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007022 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023}
7024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007025PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007026 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007028Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007029False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030
7031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007032unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033{
7034 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7035 register const Py_UNICODE *e;
7036
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 /* Shortcut for single character strings */
7038 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007039 Py_UNICODE_ISNUMERIC(*p))
7040 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007041
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007042 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007043 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007044 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007045
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 e = p + PyUnicode_GET_SIZE(self);
7047 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007048 if (!Py_UNICODE_ISNUMERIC(*p))
7049 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007050 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007051 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052}
7053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007054PyDoc_STRVAR(join__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007055 "S.join(sequence) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056\n\
7057Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007058sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059
7060static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007061unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007063 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Martin v. Löwis18e16552006-02-15 17:27:45 +00007066static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067unicode_length(PyUnicodeObject *self)
7068{
7069 return self->length;
7070}
7071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007072PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007073 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007075Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007076done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077
7078static PyObject *
7079unicode_ljust(PyUnicodeObject *self, PyObject *args)
7080{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007081 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007082 Py_UNICODE fillchar = ' ';
7083
Martin v. Löwis412fb672006-04-13 06:34:32 +00007084 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085 return NULL;
7086
Tim Peters7a29bd52001-09-12 03:03:31 +00007087 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 Py_INCREF(self);
7089 return (PyObject*) self;
7090 }
7091
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007092 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093}
7094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007095PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007096 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007098Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099
7100static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007101unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 return fixup(self, fixlower);
7104}
7105
/* Strip modes.  The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip mode above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
7114
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007115/* externally visible for str.strip(unicode) */
7116PyObject *
7117_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7118{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007119 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7120 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7121 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7122 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7123 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007124
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007125 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00007126
Benjamin Peterson857ce152009-01-31 16:29:18 +00007127 i = 0;
7128 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007129 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7130 i++;
7131 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007132 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007133
Benjamin Peterson857ce152009-01-31 16:29:18 +00007134 j = len;
7135 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007136 do {
7137 j--;
7138 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7139 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007140 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007141
Benjamin Peterson857ce152009-01-31 16:29:18 +00007142 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007143 Py_INCREF(self);
7144 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007145 }
7146 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007147 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007148}
7149
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150
7151static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007152do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007154 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7155 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007156
Benjamin Peterson857ce152009-01-31 16:29:18 +00007157 i = 0;
7158 if (striptype != RIGHTSTRIP) {
7159 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7160 i++;
7161 }
7162 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007163
Benjamin Peterson857ce152009-01-31 16:29:18 +00007164 j = len;
7165 if (striptype != LEFTSTRIP) {
7166 do {
7167 j--;
7168 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7169 j++;
7170 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007171
Benjamin Peterson857ce152009-01-31 16:29:18 +00007172 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7173 Py_INCREF(self);
7174 return (PyObject*)self;
7175 }
7176 else
7177 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178}
7179
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007180
7181static PyObject *
7182do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7183{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007184 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007185
Benjamin Peterson857ce152009-01-31 16:29:18 +00007186 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7187 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007188
Benjamin Peterson857ce152009-01-31 16:29:18 +00007189 if (sep != NULL && sep != Py_None) {
7190 if (PyUnicode_Check(sep))
7191 return _PyUnicode_XStrip(self, striptype, sep);
7192 else if (PyString_Check(sep)) {
7193 PyObject *res;
7194 sep = PyUnicode_FromObject(sep);
7195 if (sep==NULL)
7196 return NULL;
7197 res = _PyUnicode_XStrip(self, striptype, sep);
7198 Py_DECREF(sep);
7199 return res;
7200 }
7201 else {
7202 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007203 "%s arg must be None, unicode or str",
7204 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007205 return NULL;
7206 }
7207 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007208
Benjamin Peterson857ce152009-01-31 16:29:18 +00007209 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007210}
7211
7212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007213PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007214 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007215\n\
7216Return a copy of the string S with leading and trailing\n\
7217whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007218If chars is given and not None, remove characters in chars instead.\n\
7219If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007220
7221static PyObject *
7222unicode_strip(PyUnicodeObject *self, PyObject *args)
7223{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007224 if (PyTuple_GET_SIZE(args) == 0)
7225 return do_strip(self, BOTHSTRIP); /* Common case */
7226 else
7227 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007228}
7229
7230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007231PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007232 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007233\n\
7234Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007235If chars is given and not None, remove characters in chars instead.\n\
7236If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007237
7238static PyObject *
7239unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7240{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007241 if (PyTuple_GET_SIZE(args) == 0)
7242 return do_strip(self, LEFTSTRIP); /* Common case */
7243 else
7244 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007245}
7246
7247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007248PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007249 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007250\n\
7251Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007252If chars is given and not None, remove characters in chars instead.\n\
7253If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007254
7255static PyObject *
7256unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7257{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007258 if (PyTuple_GET_SIZE(args) == 0)
7259 return do_strip(self, RIGHTSTRIP); /* Common case */
7260 else
7261 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007262}
7263
7264
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007266unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267{
7268 PyUnicodeObject *u;
7269 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007270 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007271 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272
7273 if (len < 0)
7274 len = 0;
7275
Tim Peters7a29bd52001-09-12 03:03:31 +00007276 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 /* no repeat, return original string */
7278 Py_INCREF(str);
7279 return (PyObject*) str;
7280 }
Tim Peters8f422462000-09-09 06:13:41 +00007281
7282 /* ensure # of chars needed doesn't overflow int and # of bytes
7283 * needed doesn't overflow size_t
7284 */
7285 nchars = len * str->length;
7286 if (len && nchars / len != str->length) {
7287 PyErr_SetString(PyExc_OverflowError,
7288 "repeated string is too long");
7289 return NULL;
7290 }
7291 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7292 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7293 PyErr_SetString(PyExc_OverflowError,
7294 "repeated string is too long");
7295 return NULL;
7296 }
7297 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298 if (!u)
7299 return NULL;
7300
7301 p = u->str;
7302
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007303 if (str->length == 1 && len > 0) {
7304 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007305 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007306 Py_ssize_t done = 0; /* number of characters copied this far */
7307 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007308 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007309 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007310 }
7311 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007312 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007313 Py_UNICODE_COPY(p+done, p, n);
7314 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007315 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007316 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007317
7318 return (PyObject*) u;
7319}
7320
7321PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007322 PyObject *subobj,
7323 PyObject *replobj,
7324 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325{
7326 PyObject *self;
7327 PyObject *str1;
7328 PyObject *str2;
7329 PyObject *result;
7330
7331 self = PyUnicode_FromObject(obj);
7332 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 str1 = PyUnicode_FromObject(subobj);
7335 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007336 Py_DECREF(self);
7337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338 }
7339 str2 = PyUnicode_FromObject(replobj);
7340 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007341 Py_DECREF(self);
7342 Py_DECREF(str1);
7343 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 }
Tim Petersced69f82003-09-16 20:30:58 +00007345 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007346 (PyUnicodeObject *)str1,
7347 (PyUnicodeObject *)str2,
7348 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 Py_DECREF(self);
7350 Py_DECREF(str1);
7351 Py_DECREF(str2);
7352 return result;
7353}
7354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007355PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007356 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357\n\
7358Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007359old replaced by new. If the optional argument count is\n\
7360given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361
7362static PyObject*
7363unicode_replace(PyUnicodeObject *self, PyObject *args)
7364{
7365 PyUnicodeObject *str1;
7366 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007367 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007368 PyObject *result;
7369
Martin v. Löwis18e16552006-02-15 17:27:45 +00007370 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371 return NULL;
7372 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7373 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007376 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007377 Py_DECREF(str1);
7378 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380
7381 result = replace(self, str1, str2, maxcount);
7382
7383 Py_DECREF(str1);
7384 Py_DECREF(str2);
7385 return result;
7386}
7387
7388static
7389PyObject *unicode_repr(PyObject *unicode)
7390{
7391 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007392 PyUnicode_GET_SIZE(unicode),
7393 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394}
7395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007396PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007397 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398\n\
7399Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007400such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007401arguments start and end are interpreted as in slice notation.\n\
7402\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007403Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404
7405static PyObject *
7406unicode_rfind(PyUnicodeObject *self, PyObject *args)
7407{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007408 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007409 Py_ssize_t start;
7410 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007411 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
Facundo Batista57d56692007-11-16 18:04:14 +00007413 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007414 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007416 result = stringlib_rfind_slice(
7417 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7418 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7419 start, end
7420 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421
7422 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007423
7424 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425}
7426
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007427PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007428 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007429\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007430Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007431
7432static PyObject *
7433unicode_rindex(PyUnicodeObject *self, PyObject *args)
7434{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007435 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007436 Py_ssize_t start;
7437 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007438 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439
Facundo Batista57d56692007-11-16 18:04:14 +00007440 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007441 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007443 result = stringlib_rfind_slice(
7444 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7445 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7446 start, end
7447 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
7449 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007450
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 if (result < 0) {
7452 PyErr_SetString(PyExc_ValueError, "substring not found");
7453 return NULL;
7454 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007455 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456}
7457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007458PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007459 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007461Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007462done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463
7464static PyObject *
7465unicode_rjust(PyUnicodeObject *self, PyObject *args)
7466{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007467 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007468 Py_UNICODE fillchar = ' ';
7469
Martin v. Löwis412fb672006-04-13 06:34:32 +00007470 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 return NULL;
7472
Tim Peters7a29bd52001-09-12 03:03:31 +00007473 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474 Py_INCREF(self);
7475 return (PyObject*) self;
7476 }
7477
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007478 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007479}
7480
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007482unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483{
7484 /* standard clamping */
7485 if (start < 0)
7486 start = 0;
7487 if (end < 0)
7488 end = 0;
7489 if (end > self->length)
7490 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007491 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007492 /* full slice, return original string */
7493 Py_INCREF(self);
7494 return (PyObject*) self;
7495 }
7496 if (start > end)
7497 start = end;
7498 /* copy slice */
7499 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007500 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501}
7502
7503PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007504 PyObject *sep,
7505 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506{
7507 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007508
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509 s = PyUnicode_FromObject(s);
7510 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007511 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007512 if (sep != NULL) {
7513 sep = PyUnicode_FromObject(sep);
7514 if (sep == NULL) {
7515 Py_DECREF(s);
7516 return NULL;
7517 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518 }
7519
7520 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7521
7522 Py_DECREF(s);
7523 Py_XDECREF(sep);
7524 return result;
7525}
7526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007527PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007528 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529\n\
7530Return a list of the words in S, using sep as the\n\
7531delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007532splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007533whitespace string is a separator and empty strings are\n\
7534removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007535
7536static PyObject*
7537unicode_split(PyUnicodeObject *self, PyObject *args)
7538{
7539 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007540 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541
Martin v. Löwis18e16552006-02-15 17:27:45 +00007542 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543 return NULL;
7544
7545 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007546 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007548 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007550 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551}
7552
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007553PyObject *
7554PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7555{
7556 PyObject* str_obj;
7557 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007558 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007559
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007560 str_obj = PyUnicode_FromObject(str_in);
7561 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007562 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007563 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007564 if (!sep_obj) {
7565 Py_DECREF(str_obj);
7566 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007567 }
7568
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007569 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007570 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7571 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7572 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007573
Fredrik Lundhb9479482006-05-26 17:22:38 +00007574 Py_DECREF(sep_obj);
7575 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007576
7577 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007578}
7579
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007580
7581PyObject *
7582PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7583{
7584 PyObject* str_obj;
7585 PyObject* sep_obj;
7586 PyObject* out;
7587
7588 str_obj = PyUnicode_FromObject(str_in);
7589 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007590 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007591 sep_obj = PyUnicode_FromObject(sep_in);
7592 if (!sep_obj) {
7593 Py_DECREF(str_obj);
7594 return NULL;
7595 }
7596
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007597 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007598 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7599 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7600 );
7601
7602 Py_DECREF(sep_obj);
7603 Py_DECREF(str_obj);
7604
7605 return out;
7606}
7607
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007608PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007609 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007610\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007611Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007612the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007613found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007614
7615static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007616unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007617{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007618 return PyUnicode_Partition((PyObject *)self, separator);
7619}
7620
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007621PyDoc_STRVAR(rpartition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007622 "S.rpartition(sep) -> (tail, sep, head)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007623\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007624Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007625the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007626separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007627
7628static PyObject*
7629unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7630{
7631 return PyUnicode_RPartition((PyObject *)self, separator);
7632}
7633
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007634PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007635 PyObject *sep,
7636 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007637{
7638 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007639
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007640 s = PyUnicode_FromObject(s);
7641 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007642 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007643 if (sep != NULL) {
7644 sep = PyUnicode_FromObject(sep);
7645 if (sep == NULL) {
7646 Py_DECREF(s);
7647 return NULL;
7648 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007649 }
7650
7651 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7652
7653 Py_DECREF(s);
7654 Py_XDECREF(sep);
7655 return result;
7656}
7657
7658PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007659 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007660\n\
7661Return a list of the words in S, using sep as the\n\
7662delimiter string, starting at the end of the string and\n\
7663working to the front. If maxsplit is given, at most maxsplit\n\
7664splits are done. If sep is not specified, any whitespace string\n\
7665is a separator.");
7666
7667static PyObject*
7668unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7669{
7670 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007671 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007672
Martin v. Löwis18e16552006-02-15 17:27:45 +00007673 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007674 return NULL;
7675
7676 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007677 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007678 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007679 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007680 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007681 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007682}
7683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007684PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007685 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686\n\
7687Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007688Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007689is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690
7691static PyObject*
7692unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7693{
Guido van Rossum86662912000-04-11 15:38:46 +00007694 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695
Guido van Rossum86662912000-04-11 15:38:46 +00007696 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697 return NULL;
7698
Guido van Rossum86662912000-04-11 15:38:46 +00007699 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700}
7701
7702static
7703PyObject *unicode_str(PyUnicodeObject *self)
7704{
Fred Drakee4315f52000-05-09 19:53:39 +00007705 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706}
7707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007708PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007709 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710\n\
7711Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007712and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713
7714static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007715unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 return fixup(self, fixswapcase);
7718}
7719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007720PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007721 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722\n\
7723Return a copy of the string S, where all characters have been mapped\n\
7724through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007725Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7726Unmapped characters are left untouched. Characters mapped to None\n\
7727are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728
7729static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007730unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731{
Tim Petersced69f82003-09-16 20:30:58 +00007732 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007733 self->length,
7734 table,
7735 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736}
7737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007738PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007739 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742
7743static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007744unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 return fixup(self, fixupper);
7747}
7748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007749PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007750 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751\n\
Georg Brandl98064072008-09-09 19:26:00 +00007752Pad a numeric string S with zeros on the left, to fill a field\n\
7753of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754
7755static PyObject *
7756unicode_zfill(PyUnicodeObject *self, PyObject *args)
7757{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007758 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 PyUnicodeObject *u;
7760
Martin v. Löwis18e16552006-02-15 17:27:45 +00007761 Py_ssize_t width;
7762 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 return NULL;
7764
7765 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007766 if (PyUnicode_CheckExact(self)) {
7767 Py_INCREF(self);
7768 return (PyObject*) self;
7769 }
7770 else
7771 return PyUnicode_FromUnicode(
7772 PyUnicode_AS_UNICODE(self),
7773 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007774 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775 }
7776
7777 fill = width - self->length;
7778
7779 u = pad(self, fill, 0, '0');
7780
Walter Dörwald068325e2002-04-15 13:36:47 +00007781 if (u == NULL)
7782 return NULL;
7783
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784 if (u->str[fill] == '+' || u->str[fill] == '-') {
7785 /* move sign to beginning of string */
7786 u->str[0] = u->str[fill];
7787 u->str[fill] = '0';
7788 }
7789
7790 return (PyObject*) u;
7791}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792
#if 0
/* Disabled helper: returns the file-level numfree counter as a
   Python int.  self is unused. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007801PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007802 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007803\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007804Return True if S starts with the specified prefix, False otherwise.\n\
7805With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007806With optional end, stop comparing S at that position.\n\
7807prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808
7809static PyObject *
7810unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007811 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812{
Georg Brandl24250812006-06-09 18:45:48 +00007813 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007815 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007816 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007817 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818
Georg Brandl24250812006-06-09 18:45:48 +00007819 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007820 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7821 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007822 if (PyTuple_Check(subobj)) {
7823 Py_ssize_t i;
7824 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7825 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007826 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007827 if (substring == NULL)
7828 return NULL;
7829 result = tailmatch(self, substring, start, end, -1);
7830 Py_DECREF(substring);
7831 if (result) {
7832 Py_RETURN_TRUE;
7833 }
7834 }
7835 /* nothing matched */
7836 Py_RETURN_FALSE;
7837 }
7838 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007840 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007841 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007843 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844}
7845
7846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007847PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007848 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007850Return True if S ends with the specified suffix, False otherwise.\n\
7851With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007852With optional end, stop comparing S at that position.\n\
7853suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854
7855static PyObject *
7856unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007857 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858{
Georg Brandl24250812006-06-09 18:45:48 +00007859 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007861 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007862 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007863 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864
Georg Brandl24250812006-06-09 18:45:48 +00007865 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007866 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7867 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007868 if (PyTuple_Check(subobj)) {
7869 Py_ssize_t i;
7870 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7871 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007872 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007873 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007874 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007875 result = tailmatch(self, substring, start, end, +1);
7876 Py_DECREF(substring);
7877 if (result) {
7878 Py_RETURN_TRUE;
7879 }
7880 }
7881 Py_RETURN_FALSE;
7882 }
7883 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007885 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886
Georg Brandl24250812006-06-09 18:45:48 +00007887 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007889 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890}
7891
7892
Eric Smitha9f7d622008-02-17 19:46:49 +00007893/* Implements do_string_format, which is unicode because of stringlib */
7894#include "stringlib/string_format.h"
7895
7896PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007897 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007898\n\
7899");
7900
Eric Smithdc13b792008-05-30 18:10:04 +00007901static PyObject *
7902unicode__format__(PyObject *self, PyObject *args)
7903{
7904 PyObject *format_spec;
7905 PyObject *result = NULL;
7906 PyObject *tmp = NULL;
7907
7908 /* If 2.x, convert format_spec to the same type as value */
7909 /* This is to allow things like u''.format('') */
7910 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7911 goto done;
7912 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7913 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007914 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007915 goto done;
7916 }
7917 tmp = PyObject_Unicode(format_spec);
7918 if (tmp == NULL)
7919 goto done;
7920 format_spec = tmp;
7921
7922 result = _PyUnicode_FormatAdvanced(self,
7923 PyUnicode_AS_UNICODE(format_spec),
7924 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007925 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007926 Py_XDECREF(tmp);
7927 return result;
7928}
7929
Eric Smitha9f7d622008-02-17 19:46:49 +00007930PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007931 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007932\n\
7933");
7934
Robert Schuppenies901c9972008-06-10 10:10:31 +00007935static PyObject *
7936unicode__sizeof__(PyUnicodeObject *v)
7937{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007938 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7939 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007940}
7941
7942PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007943 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007944\n\
7945");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007946
7947static PyObject *
7948unicode_getnewargs(PyUnicodeObject *v)
7949{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007950 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007951}
7952
7953
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954static PyMethodDef unicode_methods[] = {
7955
7956 /* Order is according to common usage: often used methods should
7957 appear first, since lookup is done sequentially. */
7958
Georg Brandlecdc0a92006-03-30 12:19:07 +00007959 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007960 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7961 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007962 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007963 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7964 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7965 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7966 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7967 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7968 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7969 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007970 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007971 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7972 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7973 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007974 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007975 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007976/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7977 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7978 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7979 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007980 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007981 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007982 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007983 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007984 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7985 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7986 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7987 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7988 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7989 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7990 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7991 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7992 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7993 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7994 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7995 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7996 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7997 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007998 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007999 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8000 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8001 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8002 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00008003 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00008004#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008005 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006#endif
8007
8008#if 0
8009 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008010 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011#endif
8012
Benjamin Peterson857ce152009-01-31 16:29:18 +00008013 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 {NULL, NULL}
8015};
8016
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008017static PyObject *
8018unicode_mod(PyObject *v, PyObject *w)
8019{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008020 if (!PyUnicode_Check(v)) {
8021 Py_INCREF(Py_NotImplemented);
8022 return Py_NotImplemented;
8023 }
8024 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008025}
8026
8027static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008028 0, /*nb_add*/
8029 0, /*nb_subtract*/
8030 0, /*nb_multiply*/
8031 0, /*nb_divide*/
8032 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008033};
8034
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008036 (lenfunc) unicode_length, /* sq_length */
8037 PyUnicode_Concat, /* sq_concat */
8038 (ssizeargfunc) unicode_repeat, /* sq_repeat */
8039 (ssizeargfunc) unicode_getitem, /* sq_item */
8040 (ssizessizeargfunc) unicode_slice, /* sq_slice */
8041 0, /* sq_ass_item */
8042 0, /* sq_ass_slice */
8043 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044};
8045
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008046static PyObject*
8047unicode_subscript(PyUnicodeObject* self, PyObject* item)
8048{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00008049 if (PyIndex_Check(item)) {
8050 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008051 if (i == -1 && PyErr_Occurred())
8052 return NULL;
8053 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008054 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008055 return unicode_getitem(self, i);
8056 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008057 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008058 Py_UNICODE* source_buf;
8059 Py_UNICODE* result_buf;
8060 PyObject* result;
8061
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008062 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008063 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008064 return NULL;
8065 }
8066
8067 if (slicelength <= 0) {
8068 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00008069 } else if (start == 0 && step == 1 && slicelength == self->length &&
8070 PyUnicode_CheckExact(self)) {
8071 Py_INCREF(self);
8072 return (PyObject *)self;
8073 } else if (step == 1) {
8074 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008075 } else {
8076 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00008077 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8078 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008079
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008080 if (result_buf == NULL)
8081 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008082
8083 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8084 result_buf[i] = source_buf[cur];
8085 }
Tim Petersced69f82003-09-16 20:30:58 +00008086
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008087 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00008088 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008089 return result;
8090 }
8091 } else {
8092 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8093 return NULL;
8094 }
8095}
8096
8097static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00008098 (lenfunc)unicode_length, /* mp_length */
8099 (binaryfunc)unicode_subscript, /* mp_subscript */
8100 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00008101};
8102
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008105 Py_ssize_t index,
8106 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107{
8108 if (index != 0) {
8109 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008110 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 return -1;
8112 }
8113 *ptr = (void *) self->str;
8114 return PyUnicode_GET_DATA_SIZE(self);
8115}
8116
Martin v. Löwis18e16552006-02-15 17:27:45 +00008117static Py_ssize_t
8118unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008119 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120{
8121 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008122 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123 return -1;
8124}
8125
8126static int
8127unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008128 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129{
8130 if (lenp)
8131 *lenp = PyUnicode_GET_DATA_SIZE(self);
8132 return 1;
8133}
8134
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008135static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008136unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008137 Py_ssize_t index,
8138 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139{
8140 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008141
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142 if (index != 0) {
8143 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008144 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 return -1;
8146 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008147 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008149 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008150 *ptr = (void *) PyString_AS_STRING(str);
8151 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152}
8153
8154/* Helpers for PyUnicode_Format() */
8155
8156static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008157getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008159 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008161 (*p_argidx)++;
8162 if (arglen < 0)
8163 return args;
8164 else
8165 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166 }
8167 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008168 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169 return NULL;
8170}
8171
/* Bit flags for the conversion flag characters of a format specifier. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177
Martin v. Löwis18e16552006-02-15 17:27:45 +00008178static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008179strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008181 register Py_ssize_t i;
8182 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008184 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186 return len;
8187}
8188
Neal Norwitzfc76d632006-01-10 06:03:13 +00008189static int
8190doubletounicode(Py_UNICODE *buffer, size_t len, const char *format, double x)
8191{
Tim Peters15231542006-02-16 01:08:01 +00008192 Py_ssize_t result;
8193
Neal Norwitzfc76d632006-01-10 06:03:13 +00008194 PyOS_ascii_formatd((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008195 result = strtounicode(buffer, (char *)buffer);
8196 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008197}
8198
8199static int
8200longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8201{
Tim Peters15231542006-02-16 01:08:01 +00008202 Py_ssize_t result;
8203
Neal Norwitzfc76d632006-01-10 06:03:13 +00008204 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008205 result = strtounicode(buffer, (char *)buffer);
8206 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008207}
8208
Guido van Rossum078151d2002-08-11 04:24:12 +00008209/* XXX To save some code duplication, formatfloat/long/int could have been
8210 shared with stringobject.c, converting from 8-bit to Unicode after the
8211 formatting is done. */
8212
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213static int
8214formatfloat(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008215 size_t buflen,
8216 int flags,
8217 int prec,
8218 int type,
8219 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008221 /* fmt = '%#.' + `prec` + `type`
8222 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223 char fmt[20];
8224 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008225
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226 x = PyFloat_AsDouble(v);
8227 if (x == -1.0 && PyErr_Occurred())
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008228 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008230 prec = 6;
Eric Smithd6c393a2008-07-17 19:49:47 +00008231 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008232 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008233 /* Worst case length calc to ensure no buffer overrun:
8234
8235 'g' formats:
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008236 fmt = %#.<prec>g
8237 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
8238 for any double rep.)
8239 len = 1 + prec + 1 + 2 + 5 = 9 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008240
8241 'f' formats:
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008242 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
8243 len = 1 + 50 + 1 + prec = 52 + prec
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008244
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008245 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00008246 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008247
8248 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008249 if (((type == 'g' || type == 'G') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008250 buflen <= (size_t)10 + (size_t)prec) ||
8251 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
8252 PyErr_SetString(PyExc_OverflowError,
8253 "formatted float is too long (precision too large?)");
8254 return -1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008255 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00008256 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008257 (flags&F_ALT) ? "#" : "",
8258 prec, type);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008259 return doubletounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260}
8261
Tim Peters38fd5b62000-09-21 05:43:11 +00008262static PyObject*
8263formatlong(PyObject *val, int flags, int prec, int type)
8264{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008265 char *buf;
8266 int i, len;
8267 PyObject *str; /* temporary string object. */
8268 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008269
Benjamin Peterson857ce152009-01-31 16:29:18 +00008270 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8271 if (!str)
8272 return NULL;
8273 result = _PyUnicode_New(len);
8274 if (!result) {
8275 Py_DECREF(str);
8276 return NULL;
8277 }
8278 for (i = 0; i < len; i++)
8279 result->str[i] = buf[i];
8280 result->str[len] = 0;
8281 Py_DECREF(str);
8282 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008283}
8284
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285static int
8286formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008287 size_t buflen,
8288 int flags,
8289 int prec,
8290 int type,
8291 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008293 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008294 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8295 * + 1 + 1
8296 * = 24
8297 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008298 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008299 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 long x;
8301
8302 x = PyInt_AsLong(v);
8303 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008304 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008305 if (x < 0 && type == 'u') {
8306 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008307 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008308 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8309 sign = "-";
8310 else
8311 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008313 prec = 1;
8314
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008315 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8316 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008317 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008318 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008319 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008320 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008321 return -1;
8322 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008323
8324 if ((flags & F_ALT) &&
8325 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008326 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008327 * of issues that cause pain:
8328 * - when 0 is being converted, the C standard leaves off
8329 * the '0x' or '0X', which is inconsistent with other
8330 * %#x/%#X conversions and inconsistent with Python's
8331 * hex() function
8332 * - there are platforms that violate the standard and
8333 * convert 0 with the '0x' or '0X'
8334 * (Metrowerks, Compaq Tru64)
8335 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008336 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008337 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008338 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008339 * We can achieve the desired consistency by inserting our
8340 * own '0x' or '0X' prefix, and substituting %x/%X in place
8341 * of %#x/%#X.
8342 *
8343 * Note that this is the same approach as used in
8344 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008345 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008346 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8347 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008348 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008349 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008350 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8351 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008352 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008353 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008354 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008355 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008356 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008357 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358}
8359
8360static int
8361formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008362 size_t buflen,
8363 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008365 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008366 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008367 if (PyUnicode_GET_SIZE(v) != 1)
8368 goto onError;
8369 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008372 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008373 if (PyString_GET_SIZE(v) != 1)
8374 goto onError;
8375 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377
8378 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008379 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008381 x = PyInt_AsLong(v);
8382 if (x == -1 && PyErr_Occurred())
8383 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008384#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008385 if (x < 0 || x > 0x10ffff) {
8386 PyErr_SetString(PyExc_OverflowError,
8387 "%c arg not in range(0x110000) "
8388 "(wide Python build)");
8389 return -1;
8390 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008391#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008392 if (x < 0 || x > 0xffff) {
8393 PyErr_SetString(PyExc_OverflowError,
8394 "%c arg not in range(0x10000) "
8395 "(narrow Python build)");
8396 return -1;
8397 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008398#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008399 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 }
8401 buf[1] = '\0';
8402 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008403
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008404 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008405 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008406 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008407 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408}
8409
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast cannot bind to
   neighbouring tokens at the point of use (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
8419
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008421 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422{
8423 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008424 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 int args_owned = 0;
8426 PyUnicodeObject *result = NULL;
8427 PyObject *dict = NULL;
8428 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008429
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008431 PyErr_BadInternalCall();
8432 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008433 }
8434 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008435 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008436 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008437 fmt = PyUnicode_AS_UNICODE(uformat);
8438 fmtcnt = PyUnicode_GET_SIZE(uformat);
8439
8440 reslen = rescnt = fmtcnt + 100;
8441 result = _PyUnicode_New(reslen);
8442 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008443 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008444 res = PyUnicode_AS_UNICODE(result);
8445
8446 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008447 arglen = PyTuple_Size(args);
8448 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008449 }
8450 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008451 arglen = -1;
8452 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 }
Christian Heimese93237d2007-12-19 02:37:44 +00008454 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008455 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008456 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457
8458 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008459 if (*fmt != '%') {
8460 if (--rescnt < 0) {
8461 rescnt = fmtcnt + 100;
8462 reslen += rescnt;
8463 if (_PyUnicode_Resize(&result, reslen) < 0)
8464 goto onError;
8465 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8466 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008467 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008468 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008469 }
8470 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008471 /* Got a format specifier */
8472 int flags = 0;
8473 Py_ssize_t width = -1;
8474 int prec = -1;
8475 Py_UNICODE c = '\0';
8476 Py_UNICODE fill;
8477 int isnumok;
8478 PyObject *v = NULL;
8479 PyObject *temp = NULL;
8480 Py_UNICODE *pbuf;
8481 Py_UNICODE sign;
8482 Py_ssize_t len;
8483 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
8484
8485 fmt++;
8486 if (*fmt == '(') {
8487 Py_UNICODE *keystart;
8488 Py_ssize_t keylen;
8489 PyObject *key;
8490 int pcount = 1;
8491
8492 if (dict == NULL) {
8493 PyErr_SetString(PyExc_TypeError,
8494 "format requires a mapping");
8495 goto onError;
8496 }
8497 ++fmt;
8498 --fmtcnt;
8499 keystart = fmt;
8500 /* Skip over balanced parentheses */
8501 while (pcount > 0 && --fmtcnt >= 0) {
8502 if (*fmt == ')')
8503 --pcount;
8504 else if (*fmt == '(')
8505 ++pcount;
8506 fmt++;
8507 }
8508 keylen = fmt - keystart - 1;
8509 if (fmtcnt < 0 || pcount > 0) {
8510 PyErr_SetString(PyExc_ValueError,
8511 "incomplete format key");
8512 goto onError;
8513 }
8514#if 0
8515 /* keys are converted to strings using UTF-8 and
8516 then looked up since Python uses strings to hold
8517 variables names etc. in its namespaces and we
8518 wouldn't want to break common idioms. */
8519 key = PyUnicode_EncodeUTF8(keystart,
8520 keylen,
8521 NULL);
8522#else
8523 key = PyUnicode_FromUnicode(keystart, keylen);
8524#endif
8525 if (key == NULL)
8526 goto onError;
8527 if (args_owned) {
8528 Py_DECREF(args);
8529 args_owned = 0;
8530 }
8531 args = PyObject_GetItem(dict, key);
8532 Py_DECREF(key);
8533 if (args == NULL) {
8534 goto onError;
8535 }
8536 args_owned = 1;
8537 arglen = -1;
8538 argidx = -2;
8539 }
8540 while (--fmtcnt >= 0) {
8541 switch (c = *fmt++) {
8542 case '-': flags |= F_LJUST; continue;
8543 case '+': flags |= F_SIGN; continue;
8544 case ' ': flags |= F_BLANK; continue;
8545 case '#': flags |= F_ALT; continue;
8546 case '0': flags |= F_ZERO; continue;
8547 }
8548 break;
8549 }
8550 if (c == '*') {
8551 v = getnextarg(args, arglen, &argidx);
8552 if (v == NULL)
8553 goto onError;
8554 if (!PyInt_Check(v)) {
8555 PyErr_SetString(PyExc_TypeError,
8556 "* wants int");
8557 goto onError;
8558 }
8559 width = PyInt_AsLong(v);
8560 if (width < 0) {
8561 flags |= F_LJUST;
8562 width = -width;
8563 }
8564 if (--fmtcnt >= 0)
8565 c = *fmt++;
8566 }
8567 else if (c >= '0' && c <= '9') {
8568 width = c - '0';
8569 while (--fmtcnt >= 0) {
8570 c = *fmt++;
8571 if (c < '0' || c > '9')
8572 break;
8573 if ((width*10) / 10 != width) {
8574 PyErr_SetString(PyExc_ValueError,
8575 "width too big");
8576 goto onError;
8577 }
8578 width = width*10 + (c - '0');
8579 }
8580 }
8581 if (c == '.') {
8582 prec = 0;
8583 if (--fmtcnt >= 0)
8584 c = *fmt++;
8585 if (c == '*') {
8586 v = getnextarg(args, arglen, &argidx);
8587 if (v == NULL)
8588 goto onError;
8589 if (!PyInt_Check(v)) {
8590 PyErr_SetString(PyExc_TypeError,
8591 "* wants int");
8592 goto onError;
8593 }
8594 prec = PyInt_AsLong(v);
8595 if (prec < 0)
8596 prec = 0;
8597 if (--fmtcnt >= 0)
8598 c = *fmt++;
8599 }
8600 else if (c >= '0' && c <= '9') {
8601 prec = c - '0';
8602 while (--fmtcnt >= 0) {
8603 c = Py_CHARMASK(*fmt++);
8604 if (c < '0' || c > '9')
8605 break;
8606 if ((prec*10) / 10 != prec) {
8607 PyErr_SetString(PyExc_ValueError,
8608 "prec too big");
8609 goto onError;
8610 }
8611 prec = prec*10 + (c - '0');
8612 }
8613 }
8614 } /* prec */
8615 if (fmtcnt >= 0) {
8616 if (c == 'h' || c == 'l' || c == 'L') {
8617 if (--fmtcnt >= 0)
8618 c = *fmt++;
8619 }
8620 }
8621 if (fmtcnt < 0) {
8622 PyErr_SetString(PyExc_ValueError,
8623 "incomplete format");
8624 goto onError;
8625 }
8626 if (c != '%') {
8627 v = getnextarg(args, arglen, &argidx);
8628 if (v == NULL)
8629 goto onError;
8630 }
8631 sign = 0;
8632 fill = ' ';
8633 switch (c) {
8634
8635 case '%':
8636 pbuf = formatbuf;
8637 /* presume that buffer length is at least 1 */
8638 pbuf[0] = '%';
8639 len = 1;
8640 break;
8641
8642 case 's':
8643 case 'r':
8644 if (PyUnicode_Check(v) && c == 's') {
8645 temp = v;
8646 Py_INCREF(temp);
8647 }
8648 else {
8649 PyObject *unicode;
8650 if (c == 's')
8651 temp = PyObject_Unicode(v);
8652 else
8653 temp = PyObject_Repr(v);
8654 if (temp == NULL)
8655 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008656 if (PyUnicode_Check(temp))
8657 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008658 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008659 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008660 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8661 PyString_GET_SIZE(temp),
8662 NULL,
8663 "strict");
8664 Py_DECREF(temp);
8665 temp = unicode;
8666 if (temp == NULL)
8667 goto onError;
8668 }
8669 else {
8670 Py_DECREF(temp);
8671 PyErr_SetString(PyExc_TypeError,
8672 "%s argument has non-string str()");
8673 goto onError;
8674 }
8675 }
8676 pbuf = PyUnicode_AS_UNICODE(temp);
8677 len = PyUnicode_GET_SIZE(temp);
8678 if (prec >= 0 && len > prec)
8679 len = prec;
8680 break;
8681
8682 case 'i':
8683 case 'd':
8684 case 'u':
8685 case 'o':
8686 case 'x':
8687 case 'X':
8688 if (c == 'i')
8689 c = 'd';
8690 isnumok = 0;
8691 if (PyNumber_Check(v)) {
8692 PyObject *iobj=NULL;
8693
8694 if (PyInt_Check(v) || (PyLong_Check(v))) {
8695 iobj = v;
8696 Py_INCREF(iobj);
8697 }
8698 else {
8699 iobj = PyNumber_Int(v);
8700 if (iobj==NULL) iobj = PyNumber_Long(v);
8701 }
8702 if (iobj!=NULL) {
8703 if (PyInt_Check(iobj)) {
8704 isnumok = 1;
8705 pbuf = formatbuf;
8706 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8707 flags, prec, c, iobj);
8708 Py_DECREF(iobj);
8709 if (len < 0)
8710 goto onError;
8711 sign = 1;
8712 }
8713 else if (PyLong_Check(iobj)) {
8714 isnumok = 1;
8715 temp = formatlong(iobj, flags, prec, c);
8716 Py_DECREF(iobj);
8717 if (!temp)
8718 goto onError;
8719 pbuf = PyUnicode_AS_UNICODE(temp);
8720 len = PyUnicode_GET_SIZE(temp);
8721 sign = 1;
8722 }
8723 else {
8724 Py_DECREF(iobj);
8725 }
8726 }
8727 }
8728 if (!isnumok) {
8729 PyErr_Format(PyExc_TypeError,
8730 "%%%c format: a number is required, "
8731 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8732 goto onError;
8733 }
8734 if (flags & F_ZERO)
8735 fill = '0';
8736 break;
8737
8738 case 'e':
8739 case 'E':
8740 case 'f':
8741 case 'F':
8742 case 'g':
8743 case 'G':
8744 if (c == 'F')
8745 c = 'f';
8746 pbuf = formatbuf;
8747 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8748 flags, prec, c, v);
8749 if (len < 0)
8750 goto onError;
8751 sign = 1;
8752 if (flags & F_ZERO)
8753 fill = '0';
8754 break;
8755
8756 case 'c':
8757 pbuf = formatbuf;
8758 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8759 if (len < 0)
8760 goto onError;
8761 break;
8762
8763 default:
8764 PyErr_Format(PyExc_ValueError,
8765 "unsupported format character '%c' (0x%x) "
8766 "at index %zd",
8767 (31<=c && c<=126) ? (char)c : '?',
8768 (int)c,
8769 (Py_ssize_t)(fmt - 1 -
8770 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008771 goto onError;
8772 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008773 if (sign) {
8774 if (*pbuf == '-' || *pbuf == '+') {
8775 sign = *pbuf++;
8776 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008777 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008778 else if (flags & F_SIGN)
8779 sign = '+';
8780 else if (flags & F_BLANK)
8781 sign = ' ';
8782 else
8783 sign = 0;
8784 }
8785 if (width < len)
8786 width = len;
8787 if (rescnt - (sign != 0) < width) {
8788 reslen -= rescnt;
8789 rescnt = width + fmtcnt + 100;
8790 reslen += rescnt;
8791 if (reslen < 0) {
8792 Py_XDECREF(temp);
8793 PyErr_NoMemory();
8794 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008795 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008796 if (_PyUnicode_Resize(&result, reslen) < 0) {
8797 Py_XDECREF(temp);
8798 goto onError;
8799 }
8800 res = PyUnicode_AS_UNICODE(result)
8801 + reslen - rescnt;
8802 }
8803 if (sign) {
8804 if (fill != ' ')
8805 *res++ = sign;
8806 rescnt--;
8807 if (width > len)
8808 width--;
8809 }
8810 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8811 assert(pbuf[0] == '0');
8812 assert(pbuf[1] == c);
8813 if (fill != ' ') {
8814 *res++ = *pbuf++;
8815 *res++ = *pbuf++;
8816 }
8817 rescnt -= 2;
8818 width -= 2;
8819 if (width < 0)
8820 width = 0;
8821 len -= 2;
8822 }
8823 if (width > len && !(flags & F_LJUST)) {
8824 do {
8825 --rescnt;
8826 *res++ = fill;
8827 } while (--width > len);
8828 }
8829 if (fill == ' ') {
8830 if (sign)
8831 *res++ = sign;
8832 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8833 assert(pbuf[0] == '0');
8834 assert(pbuf[1] == c);
8835 *res++ = *pbuf++;
8836 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008837 }
8838 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008839 Py_UNICODE_COPY(res, pbuf, len);
8840 res += len;
8841 rescnt -= len;
8842 while (--width >= len) {
8843 --rescnt;
8844 *res++ = ' ';
8845 }
8846 if (dict && (argidx < arglen) && c != '%') {
8847 PyErr_SetString(PyExc_TypeError,
8848 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008849 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008850 goto onError;
8851 }
8852 Py_XDECREF(temp);
8853 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 } /* until end */
8855 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008856 PyErr_SetString(PyExc_TypeError,
8857 "not all arguments converted during string formatting");
8858 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859 }
8860
Thomas Woutersa96affe2006-03-12 00:29:36 +00008861 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008862 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008864 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 }
8866 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867 return (PyObject *)result;
8868
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008869 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870 Py_XDECREF(result);
8871 Py_DECREF(uformat);
8872 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008873 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874 }
8875 return NULL;
8876}
8877
8878static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008879 (readbufferproc) unicode_buffer_getreadbuf,
8880 (writebufferproc) unicode_buffer_getwritebuf,
8881 (segcountproc) unicode_buffer_getsegcount,
8882 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883};
8884
Jeremy Hylton938ace62002-07-17 16:30:39 +00008885static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008886unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8887
Tim Peters6d6c1a32001-08-02 04:15:00 +00008888static PyObject *
8889unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8890{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008891 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008892 static char *kwlist[] = {"string", "encoding", "errors", 0};
8893 char *encoding = NULL;
8894 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008895
Benjamin Peterson857ce152009-01-31 16:29:18 +00008896 if (type != &PyUnicode_Type)
8897 return unicode_subtype_new(type, args, kwds);
8898 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008899 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008900 return NULL;
8901 if (x == NULL)
8902 return (PyObject *)_PyUnicode_New(0);
8903 if (encoding == NULL && errors == NULL)
8904 return PyObject_Unicode(x);
8905 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008906 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008907}
8908
Guido van Rossume023fe02001-08-30 03:12:59 +00008909static PyObject *
8910unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8911{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008912 PyUnicodeObject *tmp, *pnew;
8913 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008914
Benjamin Peterson857ce152009-01-31 16:29:18 +00008915 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8916 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8917 if (tmp == NULL)
8918 return NULL;
8919 assert(PyUnicode_Check(tmp));
8920 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8921 if (pnew == NULL) {
8922 Py_DECREF(tmp);
8923 return NULL;
8924 }
8925 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8926 if (pnew->str == NULL) {
8927 _Py_ForgetReference((PyObject *)pnew);
8928 PyObject_Del(pnew);
8929 Py_DECREF(tmp);
8930 return PyErr_NoMemory();
8931 }
8932 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8933 pnew->length = n;
8934 pnew->hash = tmp->hash;
8935 Py_DECREF(tmp);
8936 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008937}
8938
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008939PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008940 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008941\n\
8942Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008943encoding defaults to the current default string encoding.\n\
8944errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008945
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008947 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008948 "unicode", /* tp_name */
8949 sizeof(PyUnicodeObject), /* tp_size */
8950 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008952 (destructor)unicode_dealloc, /* tp_dealloc */
8953 0, /* tp_print */
8954 0, /* tp_getattr */
8955 0, /* tp_setattr */
8956 0, /* tp_compare */
8957 unicode_repr, /* tp_repr */
8958 &unicode_as_number, /* tp_as_number */
8959 &unicode_as_sequence, /* tp_as_sequence */
8960 &unicode_as_mapping, /* tp_as_mapping */
8961 (hashfunc) unicode_hash, /* tp_hash*/
8962 0, /* tp_call*/
8963 (reprfunc) unicode_str, /* tp_str */
8964 PyObject_GenericGetAttr, /* tp_getattro */
8965 0, /* tp_setattro */
8966 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008967 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008968 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008969 unicode_doc, /* tp_doc */
8970 0, /* tp_traverse */
8971 0, /* tp_clear */
8972 PyUnicode_RichCompare, /* tp_richcompare */
8973 0, /* tp_weaklistoffset */
8974 0, /* tp_iter */
8975 0, /* tp_iternext */
8976 unicode_methods, /* tp_methods */
8977 0, /* tp_members */
8978 0, /* tp_getset */
8979 &PyBaseString_Type, /* tp_base */
8980 0, /* tp_dict */
8981 0, /* tp_descr_get */
8982 0, /* tp_descr_set */
8983 0, /* tp_dictoffset */
8984 0, /* tp_init */
8985 0, /* tp_alloc */
8986 unicode_new, /* tp_new */
8987 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988};
8989
8990/* Initialize the Unicode implementation */
8991
Thomas Wouters78890102000-07-22 19:25:51 +00008992void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008994 int i;
8995
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008996 /* XXX - move this array to unicodectype.c ? */
8997 Py_UNICODE linebreak[] = {
8998 0x000A, /* LINE FEED */
8999 0x000D, /* CARRIAGE RETURN */
9000 0x001C, /* FILE SEPARATOR */
9001 0x001D, /* GROUP SEPARATOR */
9002 0x001E, /* RECORD SEPARATOR */
9003 0x0085, /* NEXT LINE */
9004 0x2028, /* LINE SEPARATOR */
9005 0x2029, /* PARAGRAPH SEPARATOR */
9006 };
9007
Fred Drakee4315f52000-05-09 19:53:39 +00009008 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00009009 free_list = NULL;
9010 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00009012 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009013 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00009014
Marc-André Lemburg90e81472000-06-07 09:13:21 +00009015 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009016 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009017 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009018 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009019 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00009020
9021 /* initialize the linebreak bloom filter */
9022 bloom_linebreak = make_bloom_mask(
9023 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9024 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00009025
9026 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027}
9028
9029/* Finalize the Unicode implementation */
9030
Christian Heimes3b718a72008-02-14 12:47:33 +00009031int
9032PyUnicode_ClearFreeList(void)
9033{
9034 int freelist_size = numfree;
9035 PyUnicodeObject *u;
9036
9037 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009038 PyUnicodeObject *v = u;
9039 u = *(PyUnicodeObject **)u;
9040 if (v->str)
9041 PyObject_DEL(v->str);
9042 Py_XDECREF(v->defenc);
9043 PyObject_Del(v);
9044 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00009045 }
9046 free_list = NULL;
9047 assert(numfree == 0);
9048 return freelist_size;
9049}
9050
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051void
Thomas Wouters78890102000-07-22 19:25:51 +00009052_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009054 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009056 Py_XDECREF(unicode_empty);
9057 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009058
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009059 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009060 if (unicode_latin1[i]) {
9061 Py_DECREF(unicode_latin1[i]);
9062 unicode_latin1[i] = NULL;
9063 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009064 }
Christian Heimes3b718a72008-02-14 12:47:33 +00009065 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009067
Anthony Baxterac6bd462006-04-13 02:06:09 +00009068#ifdef __cplusplus
9069}
9070#endif
9071
9072
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009073/*
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00009074 Local variables:
9075 c-basic-offset: 4
9076 indent-tabs-mode: nil
9077 End:
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00009078*/