blob: 130ca48463c7dd2963afb7485bf90435d4bea206 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Whitespace lookup table for the ASCII range: entry c is 1 when the
   character with code c is one of the frequent whitespace characters
   listed below and 0 otherwise.  Sixteen entries per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 HORIZONTAL TABULATION, 0x0A LINE FEED,
       0x0B VERTICAL TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Line break lookup table for the ASCII range: entry c is 1 when the
   character with code c is one of the line break characters listed
   below and 0 otherwise.  The table is only ever read, so it is const
   like _Py_ascii_whitespace.  Sixteen entries per row. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
171
172
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000173Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000174PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000176#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000177 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000178#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 /* This is actually an illegal character, so it should
180 not be passed to unichr. */
181 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000182#endif
183}
184
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000185/* --- Bloom Filters ----------------------------------------------------- */
186
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (the character masked with
   BLOOM_WIDTH - 1) as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */
192
Antoine Pitrou10042922010-01-13 14:01:26 +0000193#if LONG_BIT >= 128
194#define BLOOM_WIDTH 128
195#elif LONG_BIT >= 64
196#define BLOOM_WIDTH 64
197#elif LONG_BIT >= 32
198#define BLOOM_WIDTH 32
199#else
200#error "LONG_BIT is smaller than 32"
201#endif
202
/* Integer type used to hold a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask for the line break characters; filled in during Unicode
   initialization. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by the low
   bits of `ch` (ch & (BLOOM_WIDTH - 1)).  `mask` must be a modifiable
   lvalue for BLOOM_ADD.  Both arguments are parenthesized so that any
   expression can be passed safely.  A zero result from BLOOM means the
   character was definitely not added; a non-zero result means it may
   have been. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* True when `ch` is a line break: a table lookup for codes below 128,
   otherwise the bloom mask is tested first and Py_UNICODE_ISLINEBREAK()
   is only consulted when the mask matches.  Note that `ch` is evaluated
   more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000213
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000214Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215{
216 /* calculate simple bloom-style bitmask for a given unicode string */
217
Antoine Pitrou10042922010-01-13 14:01:26 +0000218 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000219 Py_ssize_t i;
220
221 mask = 0;
222 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000223 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000224
225 return mask;
226}
227
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000228Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000229{
230 Py_ssize_t i;
231
232 for (i = 0; i < setlen; i++)
233 if (set[i] == chr)
234 return 1;
235
Fredrik Lundh77633512006-05-23 19:47:35 +0000236 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000237}
238
/* True when `chr` is a member of `set` (of length `setlen`).  The bloom
   mask is tested first so that the linear search in unicode_member() is
   only run for characters the mask cannot rule out.  The whole expansion
   is parenthesized so the macro behaves as a single operand, e.g. under
   a leading `!`.  Note that `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
241
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242/* --- Unicode Object ----------------------------------------------------- */
243
244static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000245int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000246 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247{
248 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000249
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000252 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000254 /* Resizing shared object (unicode_empty or single character
255 objects) in-place is not allowed. Use PyUnicode_Resize()
256 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000257
Benjamin Peterson857ce152009-01-31 16:29:18 +0000258 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000259 (unicode->length == 1 &&
260 unicode->str[0] < 256U &&
261 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000263 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 return -1;
265 }
266
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000267 /* We allocate one more byte to make sure the string is Ux0000 terminated.
268 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000269 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000270 it contains). */
271
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000273 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000274 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000275 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 PyErr_NoMemory();
278 return -1;
279 }
280 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000281 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000282
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000283 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000285 if (unicode->defenc) {
286 Py_DECREF(unicode->defenc);
287 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 }
289 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 return 0;
292}
293
/* We allocate one more Py_UNICODE than requested to make sure the
   string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
301
302static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000303PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000304{
305 register PyUnicodeObject *unicode;
306
Andrew Dalkee0df7622006-05-27 11:04:36 +0000307 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308 if (length == 0 && unicode_empty != NULL) {
309 Py_INCREF(unicode_empty);
310 return unicode_empty;
311 }
312
Neal Norwitze7d8be82008-07-31 17:17:14 +0000313 /* Ensure we won't overflow the size. */
314 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
315 return (PyUnicodeObject *)PyErr_NoMemory();
316 }
317
Guido van Rossumd57fd912000-03-10 22:53:23 +0000318 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000319 if (free_list) {
320 unicode = free_list;
321 free_list = *(PyUnicodeObject **)unicode;
322 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000323 if (unicode->str) {
324 /* Keep-Alive optimization: we only upsize the buffer,
325 never downsize it. */
326 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000327 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000328 PyObject_DEL(unicode->str);
329 unicode->str = NULL;
330 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000331 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000332 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000333 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
334 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000335 }
336 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337 }
338 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000339 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000340 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000341 if (unicode == NULL)
342 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000343 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
344 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000345 }
346
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000347 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000348 PyErr_NoMemory();
349 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000350 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000351 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000352 * the caller fails before initializing str -- unicode_resize()
353 * reads str[0], and the Keep-Alive optimization can keep memory
354 * allocated for str alive across a call to unicode_dealloc(unicode).
355 * We don't want unicode_resize to read uninitialized memory in
356 * that case.
357 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000358 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000360 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000362 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000364
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000365 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000366 /* XXX UNREF/NEWREF interface should be more symmetrical */
367 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000368 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000369 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000370 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371}
372
373static
Guido van Rossum9475a232001-10-05 20:51:39 +0000374void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000376 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000377 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000378 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000379 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
380 PyObject_DEL(unicode->str);
381 unicode->str = NULL;
382 unicode->length = 0;
383 }
384 if (unicode->defenc) {
385 Py_DECREF(unicode->defenc);
386 unicode->defenc = NULL;
387 }
388 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000389 *(PyUnicodeObject **)unicode = free_list;
390 free_list = unicode;
391 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 }
393 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000394 PyObject_DEL(unicode->str);
395 Py_XDECREF(unicode->defenc);
396 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 }
398}
399
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000400static
401int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000402{
403 register PyUnicodeObject *v;
404
405 /* Argument checks */
406 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyErr_BadInternalCall();
408 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000409 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000410 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000411 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000412 PyErr_BadInternalCall();
413 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000414 }
415
416 /* Resizing unicode_empty and single character objects is not
417 possible since these are being shared. We simply return a fresh
418 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000419 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 (v == unicode_empty || v->length == 1)) {
421 PyUnicodeObject *w = _PyUnicode_New(length);
422 if (w == NULL)
423 return -1;
424 Py_UNICODE_COPY(w->str, v->str,
425 length < v->length ? length : v->length);
426 Py_DECREF(*unicode);
427 *unicode = w;
428 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 }
430
431 /* Note that we don't have to modify *unicode for unshared Unicode
432 objects, since we can modify them in-place. */
433 return unicode_resize(v, length);
434}
435
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000436int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
437{
438 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
439}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000442 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443{
444 PyUnicodeObject *unicode;
445
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446 /* If the Unicode data is known at construction time, we can apply
447 some optimizations which share commonly used objects. */
448 if (u != NULL) {
449
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000450 /* Optimization for empty strings */
451 if (size == 0 && unicode_empty != NULL) {
452 Py_INCREF(unicode_empty);
453 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000454 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455
456 /* Single character Unicode objects in the Latin-1 range are
457 shared when using this constructor */
458 if (size == 1 && *u < 256) {
459 unicode = unicode_latin1[*u];
460 if (!unicode) {
461 unicode = _PyUnicode_New(1);
462 if (!unicode)
463 return NULL;
464 unicode->str[0] = *u;
465 unicode_latin1[*u] = unicode;
466 }
467 Py_INCREF(unicode);
468 return (PyObject *)unicode;
469 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000470 }
Tim Petersced69f82003-09-16 20:30:58 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 unicode = _PyUnicode_New(size);
473 if (!unicode)
474 return NULL;
475
476 /* Copy the Unicode data into the new object */
477 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000478 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479
480 return (PyObject *)unicode;
481}
482
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000483PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
484{
485 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000486
Benjamin Peterson857ce152009-01-31 16:29:18 +0000487 if (size < 0) {
488 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000490 return NULL;
491 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000492
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000493 /* If the Unicode data is known at construction time, we can apply
494 some optimizations which share commonly used objects.
495 Also, this means the input must be UTF-8, so fall back to the
496 UTF-8 decoder at the end. */
497 if (u != NULL) {
498
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000499 /* Optimization for empty strings */
500 if (size == 0 && unicode_empty != NULL) {
501 Py_INCREF(unicode_empty);
502 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000503 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000504
505 /* Single characters are shared when using this constructor.
506 Restrict to ASCII, since the input must be UTF-8. */
507 if (size == 1 && Py_CHARMASK(*u) < 128) {
508 unicode = unicode_latin1[Py_CHARMASK(*u)];
509 if (!unicode) {
510 unicode = _PyUnicode_New(1);
511 if (!unicode)
512 return NULL;
513 unicode->str[0] = Py_CHARMASK(*u);
514 unicode_latin1[Py_CHARMASK(*u)] = unicode;
515 }
516 Py_INCREF(unicode);
517 return (PyObject *)unicode;
518 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000519
520 return PyUnicode_DecodeUTF8(u, size, NULL);
521 }
522
523 unicode = _PyUnicode_New(size);
524 if (!unicode)
525 return NULL;
526
527 return (PyObject *)unicode;
528}
529
530PyObject *PyUnicode_FromString(const char *u)
531{
532 size_t size = strlen(u);
533 if (size > PY_SSIZE_T_MAX) {
534 PyErr_SetString(PyExc_OverflowError, "input too long");
535 return NULL;
536 }
537
538 return PyUnicode_FromStringAndSize(u, size);
539}
540
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541#ifdef HAVE_WCHAR_H
542
Mark Dickinson6b265f12009-03-18 16:07:26 +0000543#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544# define CONVERT_WCHAR_TO_SURROGATES
545#endif
546
547#ifdef CONVERT_WCHAR_TO_SURROGATES
548
549/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550 to convert from UTF32 to UTF16. */
551
552PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
553 Py_ssize_t size)
554{
555 PyUnicodeObject *unicode;
556 register Py_ssize_t i;
557 Py_ssize_t alloc;
558 const wchar_t *orig_w;
559
560 if (w == NULL) {
561 PyErr_BadInternalCall();
562 return NULL;
563 }
564
565 alloc = size;
566 orig_w = w;
567 for (i = size; i > 0; i--) {
568 if (*w > 0xFFFF)
569 alloc++;
570 w++;
571 }
572 w = orig_w;
573 unicode = _PyUnicode_New(alloc);
574 if (!unicode)
575 return NULL;
576
577 /* Copy the wchar_t data into the new object */
578 {
579 register Py_UNICODE *u;
580 u = PyUnicode_AS_UNICODE(unicode);
581 for (i = size; i > 0; i--) {
582 if (*w > 0xFFFF) {
583 wchar_t ordinal = *w++;
584 ordinal -= 0x10000;
585 *u++ = 0xD800 | (ordinal >> 10);
586 *u++ = 0xDC00 | (ordinal & 0x3FF);
587 }
588 else
589 *u++ = *w++;
590 }
591 }
592 return (PyObject *)unicode;
593}
594
595#else
596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000598 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599{
600 PyUnicodeObject *unicode;
601
602 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000603 PyErr_BadInternalCall();
604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 }
606
607 unicode = _PyUnicode_New(size);
608 if (!unicode)
609 return NULL;
610
611 /* Copy the wchar_t data into the new object */
612#ifdef HAVE_USABLE_WCHAR_T
613 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000614#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000615 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000616 register Py_UNICODE *u;
617 register Py_ssize_t i;
618 u = PyUnicode_AS_UNICODE(unicode);
619 for (i = size; i > 0; i--)
620 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 }
622#endif
623
624 return (PyObject *)unicode;
625}
626
Mark Dickinson6b265f12009-03-18 16:07:26 +0000627#endif /* CONVERT_WCHAR_TO_SURROGATES */
628
629#undef CONVERT_WCHAR_TO_SURROGATES
630
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000631static void
632makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
633{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000634 *fmt++ = '%';
635 if (width) {
636 if (zeropad)
637 *fmt++ = '0';
638 fmt += sprintf(fmt, "%d", width);
639 }
640 if (precision)
641 fmt += sprintf(fmt, ".%d", precision);
642 if (longflag)
643 *fmt++ = 'l';
644 else if (size_tflag) {
645 char *f = PY_FORMAT_SIZE_T;
646 while (*f)
647 *fmt++ = *f++;
648 }
649 *fmt++ = c;
650 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000651}
652
/* Append the NUL-terminated char string `string` to the output, one
   character at a time.  Relies on two variables in the scope of the
   expansion: `copy` (a const char * scratch pointer) and `s` (the output
   position, which is advanced past the appended characters).  No
   terminator is written.  Wrapped in do/while(0) so that it acts as a
   single statement, e.g. in an if/else without braces. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)
654
655PyObject *
656PyUnicode_FromFormatV(const char *format, va_list vargs)
657{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000658 va_list count;
659 Py_ssize_t callcount = 0;
660 PyObject **callresults = NULL;
661 PyObject **callresult = NULL;
662 Py_ssize_t n = 0;
663 int width = 0;
664 int precision = 0;
665 int zeropad;
666 const char* f;
667 Py_UNICODE *s;
668 PyObject *string;
669 /* used by sprintf */
670 char buffer[21];
671 /* use abuffer instead of buffer, if we need more space
672 * (which can happen if there's a format specifier with width). */
673 char *abuffer = NULL;
674 char *realbuffer;
675 Py_ssize_t abuffersize = 0;
676 char fmt[60]; /* should be enough for %0width.precisionld */
677 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000678
679#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000680 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000681#else
682#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000683 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000684#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#endif
687#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000688 /* step 1: count the number of %S/%R/%s format specifications
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000691 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000692 if (*f == '%') {
693 if (*(f+1)=='%')
694 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000695 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000696 ++callcount;
697 while (isdigit((unsigned)*f))
698 width = (width*10) + *f++ - '0';
699 while (*++f && *f != '%' && !isalpha((unsigned)*f))
700 ;
701 if (*f == 's')
702 ++callcount;
703 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000704 }
705 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000707 if (callcount) {
708 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
709 if (!callresults) {
710 PyErr_NoMemory();
711 return NULL;
712 }
713 callresult = callresults;
714 }
715 /* step 3: figure out how large a buffer we need */
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f;
719 width = 0;
720 while (isdigit((unsigned)*f))
721 width = (width*10) + *f++ - '0';
722 while (*++f && *f != '%' && !isalpha((unsigned)*f))
723 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000724
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726 * they don't affect the amount of space we reserve.
727 */
728 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000729 (f[1] == 'd' || f[1] == 'u'))
730 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731
Benjamin Peterson857ce152009-01-31 16:29:18 +0000732 switch (*f) {
733 case 'c':
734 (void)va_arg(count, int);
735 /* fall through... */
736 case '%':
737 n++;
738 break;
739 case 'd': case 'u': case 'i': case 'x':
740 (void) va_arg(count, int);
741 /* 20 bytes is enough to hold a 64-bit
742 integer. Decimal takes the most space.
743 This isn't enough for octal.
744 If a width is specified we need more
745 (which we allocate later). */
746 if (width < 20)
747 width = 20;
748 n += width;
749 if (abuffersize < width)
750 abuffersize = width;
751 break;
752 case 's':
753 {
754 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000755 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000756 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
757 if (!str)
758 goto fail;
759 n += PyUnicode_GET_SIZE(str);
760 /* Remember the str and switch to the next slot */
761 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000762 break;
763 }
764 case 'U':
765 {
766 PyObject *obj = va_arg(count, PyObject *);
767 assert(obj && PyUnicode_Check(obj));
768 n += PyUnicode_GET_SIZE(obj);
769 break;
770 }
771 case 'V':
772 {
773 PyObject *obj = va_arg(count, PyObject *);
774 const char *str = va_arg(count, const char *);
775 assert(obj || str);
776 assert(!obj || PyUnicode_Check(obj));
777 if (obj)
778 n += PyUnicode_GET_SIZE(obj);
779 else
780 n += strlen(str);
781 break;
782 }
783 case 'S':
784 {
785 PyObject *obj = va_arg(count, PyObject *);
786 PyObject *str;
787 assert(obj);
788 str = PyObject_Str(obj);
789 if (!str)
790 goto fail;
791 n += PyUnicode_GET_SIZE(str);
792 /* Remember the str and switch to the next slot */
793 *callresult++ = str;
794 break;
795 }
796 case 'R':
797 {
798 PyObject *obj = va_arg(count, PyObject *);
799 PyObject *repr;
800 assert(obj);
801 repr = PyObject_Repr(obj);
802 if (!repr)
803 goto fail;
804 n += PyUnicode_GET_SIZE(repr);
805 /* Remember the repr and switch to the next slot */
806 *callresult++ = repr;
807 break;
808 }
809 case 'p':
810 (void) va_arg(count, int);
811 /* maximum 64-bit pointer representation:
812 * 0xffffffffffffffff
813 * so 19 characters is enough.
814 * XXX I count 18 -- what's the extra for?
815 */
816 n += 19;
817 break;
818 default:
819 /* if we stumble upon an unknown
820 formatting code, copy the rest of
821 the format string to the output
822 string. (we cannot just skip the
823 code, since there's no way to know
824 what's in the argument list) */
825 n += strlen(p);
826 goto expand;
827 }
828 } else
829 n++;
830 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000831 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000832 if (abuffersize > 20) {
833 abuffer = PyObject_Malloc(abuffersize);
834 if (!abuffer) {
835 PyErr_NoMemory();
836 goto fail;
837 }
838 realbuffer = abuffer;
839 }
840 else
841 realbuffer = buffer;
842 /* step 4: fill the buffer */
843 /* Since we've analyzed how much space we need for the worst case,
844 we don't have to resize the string.
845 There can be no errors beyond this point. */
846 string = PyUnicode_FromUnicode(NULL, n);
847 if (!string)
848 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000849
Benjamin Peterson857ce152009-01-31 16:29:18 +0000850 s = PyUnicode_AS_UNICODE(string);
851 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000852
Benjamin Peterson857ce152009-01-31 16:29:18 +0000853 for (f = format; *f; f++) {
854 if (*f == '%') {
855 const char* p = f++;
856 int longflag = 0;
857 int size_tflag = 0;
858 zeropad = (*f == '0');
859 /* parse the width.precision part */
860 width = 0;
861 while (isdigit((unsigned)*f))
862 width = (width*10) + *f++ - '0';
863 precision = 0;
864 if (*f == '.') {
865 f++;
866 while (isdigit((unsigned)*f))
867 precision = (precision*10) + *f++ - '0';
868 }
869 /* handle the long flag, but only for %ld and %lu.
870 others can be added when necessary. */
871 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
872 longflag = 1;
873 ++f;
874 }
875 /* handle the size_t flag. */
876 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
877 size_tflag = 1;
878 ++f;
879 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000880
Benjamin Peterson857ce152009-01-31 16:29:18 +0000881 switch (*f) {
882 case 'c':
883 *s++ = va_arg(vargs, int);
884 break;
885 case 'd':
886 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
887 if (longflag)
888 sprintf(realbuffer, fmt, va_arg(vargs, long));
889 else if (size_tflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
891 else
892 sprintf(realbuffer, fmt, va_arg(vargs, int));
893 appendstring(realbuffer);
894 break;
895 case 'u':
896 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
897 if (longflag)
898 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
899 else if (size_tflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
901 else
902 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
903 appendstring(realbuffer);
904 break;
905 case 'i':
906 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
909 break;
910 case 'x':
911 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
912 sprintf(realbuffer, fmt, va_arg(vargs, int));
913 appendstring(realbuffer);
914 break;
915 case 's':
916 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000917 /* unused, since we already have the result */
918 (void) va_arg(vargs, char *);
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
920 PyUnicode_GET_SIZE(*callresult));
921 s += PyUnicode_GET_SIZE(*callresult);
922 /* We're done with the unicode()/repr() => forget it */
923 Py_DECREF(*callresult);
924 /* switch to next unicode()/repr() result */
925 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000926 break;
927 }
928 case 'U':
929 {
930 PyObject *obj = va_arg(vargs, PyObject *);
931 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933 s += size;
934 break;
935 }
936 case 'V':
937 {
938 PyObject *obj = va_arg(vargs, PyObject *);
939 const char *str = va_arg(vargs, const char *);
940 if (obj) {
941 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
942 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
943 s += size;
944 } else {
945 appendstring(str);
946 }
947 break;
948 }
949 case 'S':
950 case 'R':
951 {
952 Py_UNICODE *ucopy;
953 Py_ssize_t usize;
954 Py_ssize_t upos;
955 /* unused, since we already have the result */
956 (void) va_arg(vargs, PyObject *);
957 ucopy = PyUnicode_AS_UNICODE(*callresult);
958 usize = PyUnicode_GET_SIZE(*callresult);
959 for (upos = 0; upos<usize;)
960 *s++ = ucopy[upos++];
961 /* We're done with the unicode()/repr() => forget it */
962 Py_DECREF(*callresult);
963 /* switch to next unicode()/repr() result */
964 ++callresult;
965 break;
966 }
967 case 'p':
968 sprintf(buffer, "%p", va_arg(vargs, void*));
969 /* %p is ill-defined: ensure leading 0x. */
970 if (buffer[1] == 'X')
971 buffer[1] = 'x';
972 else if (buffer[1] != 'x') {
973 memmove(buffer+2, buffer, strlen(buffer)+1);
974 buffer[0] = '0';
975 buffer[1] = 'x';
976 }
977 appendstring(buffer);
978 break;
979 case '%':
980 *s++ = '%';
981 break;
982 default:
983 appendstring(p);
984 goto end;
985 }
986 } else
987 *s++ = *f;
988 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000989
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000990 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000991 if (callresults)
992 PyObject_Free(callresults);
993 if (abuffer)
994 PyObject_Free(abuffer);
995 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
996 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000997 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000998 if (callresults) {
999 PyObject **callresult2 = callresults;
1000 while (callresult2 < callresult) {
1001 Py_DECREF(*callresult2);
1002 ++callresult2;
1003 }
1004 PyObject_Free(callresults);
1005 }
1006 if (abuffer)
1007 PyObject_Free(abuffer);
1008 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001009}
1010
1011#undef appendstring
1012
1013PyObject *
1014PyUnicode_FromFormat(const char *format, ...)
1015{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001016 PyObject* ret;
1017 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001018
1019#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001020 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001021#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 ret = PyUnicode_FromFormatV(format, vargs);
1025 va_end(vargs);
1026 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001027}
1028
Martin v. Löwis18e16552006-02-15 17:27:45 +00001029Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001030 wchar_t *w,
1031 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032{
1033 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001034 PyErr_BadInternalCall();
1035 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001037
1038 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001040 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001041
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042#ifdef HAVE_USABLE_WCHAR_T
1043 memcpy(w, unicode->str, size * sizeof(wchar_t));
1044#else
1045 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001046 register Py_UNICODE *u;
1047 register Py_ssize_t i;
1048 u = PyUnicode_AS_UNICODE(unicode);
1049 for (i = size; i > 0; i--)
1050 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051 }
1052#endif
1053
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001054 if (size > PyUnicode_GET_SIZE(unicode))
1055 return PyUnicode_GET_SIZE(unicode);
1056 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001057 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058}
1059
1060#endif
1061
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062PyObject *PyUnicode_FromOrdinal(int ordinal)
1063{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001064 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001065
1066#ifdef Py_UNICODE_WIDE
1067 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001068 PyErr_SetString(PyExc_ValueError,
1069 "unichr() arg not in range(0x110000) "
1070 "(wide Python build)");
1071 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001072 }
1073#else
1074 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001075 PyErr_SetString(PyExc_ValueError,
1076 "unichr() arg not in range(0x10000) "
1077 "(narrow Python build)");
1078 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001079 }
1080#endif
1081
Hye-Shik Chang40574832004-04-06 07:24:51 +00001082 s[0] = (Py_UNICODE)ordinal;
1083 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001084}
1085
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086PyObject *PyUnicode_FromObject(register PyObject *obj)
1087{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001089 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 Py_INCREF(obj);
1092 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001093 }
1094 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001095 /* For a Unicode subtype that's not a Unicode object,
1096 return a true Unicode object with the same data. */
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101}
1102
1103PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001104 const char *encoding,
1105 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001106{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001107 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001108 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001112 PyErr_BadInternalCall();
1113 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001116#if 0
1117 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001118 that no encodings is given and then redirect to
1119 PyObject_Unicode() which then applies the additional logic for
1120 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001121
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001122 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001123 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124
1125 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001126 if (PyUnicode_Check(obj)) {
1127 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001128 PyErr_SetString(PyExc_TypeError,
1129 "decoding Unicode is not supported");
1130 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001131 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 return PyObject_Unicode(obj);
1133 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001134#else
1135 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001136 PyErr_SetString(PyExc_TypeError,
1137 "decoding Unicode is not supported");
1138 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001139 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001140#endif
1141
1142 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001143 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001144 s = PyString_AS_STRING(obj);
1145 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001146 }
Christian Heimes3497f942008-05-26 12:29:14 +00001147 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 /* Python 2.x specific */
1149 PyErr_Format(PyExc_TypeError,
1150 "decoding bytearray is not supported");
1151 return NULL;
1152 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001153 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001154 /* Overwrite the error message with something more useful in
1155 case of a TypeError. */
1156 if (PyErr_ExceptionMatches(PyExc_TypeError))
1157 PyErr_Format(PyExc_TypeError,
1158 "coercing to Unicode: need string or buffer, "
1159 "%.80s found",
1160 Py_TYPE(obj)->tp_name);
1161 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001162 }
Tim Petersced69f82003-09-16 20:30:58 +00001163
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001164 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001166 Py_INCREF(unicode_empty);
1167 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Tim Petersced69f82003-09-16 20:30:58 +00001169 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001170 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001171
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001172 return v;
1173
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001174 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176}
1177
1178PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001179 Py_ssize_t size,
1180 const char *encoding,
1181 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182{
1183 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001184
1185 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001186 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001187
1188 /* Shortcuts for common default encodings */
1189 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001191 else if (strcmp(encoding, "latin-1") == 0)
1192 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194 else if (strcmp(encoding, "mbcs") == 0)
1195 return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001197 else if (strcmp(encoding, "ascii") == 0)
1198 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199
1200 /* Decode via the codec registry */
1201 buffer = PyBuffer_FromMemory((void *)s, size);
1202 if (buffer == NULL)
1203 goto onError;
1204 unicode = PyCodec_Decode(buffer, encoding, errors);
1205 if (unicode == NULL)
1206 goto onError;
1207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001209 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001210 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 Py_DECREF(unicode);
1212 goto onError;
1213 }
1214 Py_DECREF(buffer);
1215 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001216
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001217 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 Py_XDECREF(buffer);
1219 return NULL;
1220}
1221
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001222PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223 const char *encoding,
1224 const char *errors)
1225{
1226 PyObject *v;
1227
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1231 }
1232
1233 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001234 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001235
1236 /* Decode via the codec registry */
1237 v = PyCodec_Decode(unicode, encoding, errors);
1238 if (v == NULL)
1239 goto onError;
1240 return v;
1241
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001242 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001243 return NULL;
1244}
1245
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001247 Py_ssize_t size,
1248 const char *encoding,
1249 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250{
1251 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001252
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 unicode = PyUnicode_FromUnicode(s, size);
1254 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257 Py_DECREF(unicode);
1258 return v;
1259}
1260
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001261PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262 const char *encoding,
1263 const char *errors)
1264{
1265 PyObject *v;
1266
1267 if (!PyUnicode_Check(unicode)) {
1268 PyErr_BadArgument();
1269 goto onError;
1270 }
1271
1272 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001273 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001274
1275 /* Encode via the codec registry */
1276 v = PyCodec_Encode(unicode, encoding, errors);
1277 if (v == NULL)
1278 goto onError;
1279 return v;
1280
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001281 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001282 return NULL;
1283}
1284
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286 const char *encoding,
1287 const char *errors)
1288{
1289 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 if (!PyUnicode_Check(unicode)) {
1292 PyErr_BadArgument();
1293 goto onError;
1294 }
Fred Drakee4315f52000-05-09 19:53:39 +00001295
Tim Petersced69f82003-09-16 20:30:58 +00001296 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001297 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001298
1299 /* Shortcuts for common default encodings */
1300 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001301 if (strcmp(encoding, "utf-8") == 0)
1302 return PyUnicode_AsUTF8String(unicode);
1303 else if (strcmp(encoding, "latin-1") == 0)
1304 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001305#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001306 else if (strcmp(encoding, "mbcs") == 0)
1307 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001308#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001309 else if (strcmp(encoding, "ascii") == 0)
1310 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001317 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001319 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001320 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 Py_DECREF(v);
1322 goto onError;
1323 }
1324 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001325
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001326 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 return NULL;
1328}
1329
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001330PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001331 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332{
1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334
1335 if (v)
1336 return v;
1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338 if (v && errors == NULL)
1339 ((PyUnicodeObject *)unicode)->defenc = v;
1340 return v;
1341}
1342
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344{
1345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 goto onError;
1348 }
1349 return PyUnicode_AS_UNICODE(unicode);
1350
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001351 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 return NULL;
1353}
1354
Martin v. Löwis18e16552006-02-15 17:27:45 +00001355Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001356{
1357 if (!PyUnicode_Check(unicode)) {
1358 PyErr_BadArgument();
1359 goto onError;
1360 }
1361 return PyUnicode_GET_SIZE(unicode);
1362
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001363 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364 return -1;
1365}
1366
Thomas Wouters78890102000-07-22 19:25:51 +00001367const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001368{
1369 return unicode_default_encoding;
1370}
1371
1372int PyUnicode_SetDefaultEncoding(const char *encoding)
1373{
1374 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001375
Fred Drakee4315f52000-05-09 19:53:39 +00001376 /* Make sure the encoding is valid. As side effect, this also
1377 loads the encoding into the codec registry cache. */
1378 v = _PyCodec_Lookup(encoding);
1379 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001380 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001381 Py_DECREF(v);
1382 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001383 encoding,
1384 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001385 return 0;
1386
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001387 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001388 return -1;
1389}
1390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001391/* error handling callback helper:
1392 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001393 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001394 and adjust various state variables.
1395 return 0 on success, -1 on error
1396*/
1397
1398static
1399int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001400 const char *encoding, const char *reason,
1401 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001404{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001405 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406
1407 PyObject *restuple = NULL;
1408 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410 Py_ssize_t requiredsize;
1411 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 int res = -1;
1415
1416 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001417 *errorHandler = PyCodec_LookupError(errors);
1418 if (*errorHandler == NULL)
1419 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 }
1421
1422 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001423 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001424 encoding, input, insize, *startinpos, *endinpos, reason);
1425 if (*exceptionObject == NULL)
1426 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001427 }
1428 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430 goto onError;
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 }
1436
1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001439 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001440 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001441 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001442 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 }
1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001448 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001451 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452
1453 /* need more space? (at least enough for what we
1454 have+the replacement+the rest of the string (starting
1455 at the new input position), so we won't have to check space
1456 when there are no errors in the rest of the string) */
1457 repptr = PyUnicode_AS_UNICODE(repunicode);
1458 repsize = PyUnicode_GET_SIZE(repunicode);
1459 requiredsize = *outpos + repsize + insize-newpos;
1460 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001461 if (requiredsize<2*outsize)
1462 requiredsize = 2*outsize;
1463 if (_PyUnicode_Resize(output, requiredsize) < 0)
1464 goto onError;
1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466 }
1467 *endinpos = newpos;
1468 *inptr = input + newpos;
1469 Py_UNICODE_COPY(*outptr, repptr, repsize);
1470 *outptr += repsize;
1471 *outpos += repsize;
1472 /* we made it! */
1473 res = 0;
1474
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001475 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001476 Py_XDECREF(restuple);
1477 return res;
1478}
1479
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480/* --- UTF-7 Codec -------------------------------------------------------- */
1481
Antoine Pitrou653dece2009-05-04 18:32:32 +00001482/* See RFC2152 for details. We encode conservatively and decode liberally. */
1483
1484/* Three simple macros defining base-64. */
1485
1486/* Is c a base-64 character? */
1487
1488#define IS_BASE64(c) \
1489 (isalnum(c) || (c) == '+' || (c) == '/')
1490
1491/* given that c is a base-64 character, what is its base-64 value? */
1492
1493#define FROM_BASE64(c) \
1494 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
1495 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
1496 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
1497 (c) == '+' ? 62 : 63)
1498
1499/* What is the base-64 character of the bottom 6 bits of n? */
1500
1501#define TO_BASE64(n) \
1502 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
1503
1504/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
1505 * decoded as itself. We are permissive on decoding; the only ASCII
1506 * byte not decoding to itself is the + which begins a base64
1507 * string. */
1508
1509#define DECODE_DIRECT(c) \
1510 ((c) <= 127 && (c) != '+')
1511
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1545
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c         character code; NUL and anything >= 128 is never direct
 *           (the range test runs first, so utf7_category is only
 *           indexed with 1..127)
 * directO   non-zero: Set O characters (category 1) are emitted as-is
 * directWS  non-zero: whitespace (category 2) is emitted as-is
 *
 * The flag arguments are parenthesized in the expansion so that a
 * compound expression such as `a || b` keeps its meaning next to the
 * `&&`.  c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001559 Py_ssize_t size,
1560 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001562 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563}
1564
Antoine Pitrou653dece2009-05-04 18:32:32 +00001565/* The decoder. The only state we preserve is our read position,
1566 * i.e. how many characters we have consumed. So if we end in the
1567 * middle of a shift sequence we have to back off the read position
1568 * and the output to the beginning of the sequence, otherwise we lose
1569 * all the shift state (seen bits, number of bits seen, high
1570 * surrogate). */
1571
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001572PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001573 Py_ssize_t size,
1574 const char *errors,
1575 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001576{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001577 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t startinpos;
1579 Py_ssize_t endinpos;
1580 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581 const char *e;
1582 PyUnicodeObject *unicode;
1583 Py_UNICODE *p;
1584 const char *errmsg = "";
1585 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001586 Py_UNICODE *shiftOutStart;
1587 unsigned int base64bits = 0;
1588 unsigned long base64buffer = 0;
1589 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001590 PyObject *errorHandler = NULL;
1591 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592
1593 unicode = _PyUnicode_New(size);
1594 if (!unicode)
1595 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001596 if (size == 0) {
1597 if (consumed)
1598 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001599 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001600 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601
1602 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001603 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 e = s + size;
1605
1606 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001607 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 if (inShift) { /* in a base-64 section */
1610 if (IS_BASE64(ch)) { /* consume a base-64 character */
1611 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612 base64bits += 6;
1613 s++;
1614 if (base64bits >= 16) {
1615 /* we have enough bits for a UTF-16 value */
1616 Py_UNICODE outCh = (Py_UNICODE)
1617 (base64buffer >> (base64bits-16));
1618 base64bits -= 16;
1619 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620 if (surrogate) {
1621 /* expecting a second surrogate */
1622 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623#ifdef Py_UNICODE_WIDE
1624 *p++ = (((surrogate & 0x3FF)<<10)
1625 | (outCh & 0x3FF)) + 0x10000;
1626#else
1627 *p++ = surrogate;
1628 *p++ = outCh;
1629#endif
1630 surrogate = 0;
1631 }
1632 else {
1633 surrogate = 0;
1634 errmsg = "second surrogate missing";
1635 goto utf7Error;
1636 }
1637 }
1638 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1639 /* first surrogate */
1640 surrogate = outCh;
1641 }
1642 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1643 errmsg = "unexpected second surrogate";
1644 goto utf7Error;
1645 }
1646 else {
1647 *p++ = outCh;
1648 }
1649 }
1650 }
1651 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001652 inShift = 0;
1653 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001654 if (surrogate) {
1655 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00001656 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001657 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001658 if (base64bits > 0) { /* left-over bits */
1659 if (base64bits >= 6) {
1660 /* We've seen at least one base-64 character */
1661 errmsg = "partial character in shift sequence";
1662 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001663 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001664 else {
1665 /* Some bits remain; they should be zero */
1666 if (base64buffer != 0) {
1667 errmsg = "non-zero padding bits in shift sequence";
1668 goto utf7Error;
1669 }
1670 }
1671 }
1672 if (ch != '-') {
1673 /* '-' is absorbed; other terminating
1674 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001675 *p++ = ch;
1676 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001677 }
1678 }
1679 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001680 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001681 s++; /* consume '+' */
1682 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001683 s++;
1684 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001685 }
1686 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001687 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001688 shiftOutStart = p;
1689 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001690 }
1691 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001692 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001693 *p++ = ch;
1694 s++;
1695 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001696 else {
1697 startinpos = s-starts;
1698 s++;
1699 errmsg = "unexpected special character";
1700 goto utf7Error;
1701 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001702 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001703utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001704 outpos = p-PyUnicode_AS_UNICODE(unicode);
1705 endinpos = s-starts;
1706 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001707 errors, &errorHandler,
1708 "utf7", errmsg,
1709 starts, size, &startinpos, &endinpos, &exc, &s,
1710 &unicode, &outpos, &p))
1711 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001712 }
1713
Antoine Pitrou653dece2009-05-04 18:32:32 +00001714 /* end of string */
1715
1716 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1717 /* if we're in an inconsistent state, that's an error */
1718 if (surrogate ||
1719 (base64bits >= 6) ||
1720 (base64bits > 0 && base64buffer != 0)) {
1721 outpos = p-PyUnicode_AS_UNICODE(unicode);
1722 endinpos = size;
1723 if (unicode_decode_call_errorhandler(
1724 errors, &errorHandler,
1725 "utf7", "unterminated shift sequence",
1726 starts, size, &startinpos, &endinpos, &exc, &s,
1727 &unicode, &outpos, &p))
1728 goto onError;
1729 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001730 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001731
1732 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001733 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001734 if (inShift) {
1735 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001736 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001737 }
1738 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001739 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001740 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001741 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001742
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001743 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744 goto onError;
1745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 Py_XDECREF(errorHandler);
1747 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001748 return (PyObject *)unicode;
1749
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001750 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001751 Py_XDECREF(errorHandler);
1752 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001753 Py_DECREF(unicode);
1754 return NULL;
1755}
1756
1757
1758PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001759 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001760 int base64SetO,
1761 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001762 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001763{
1764 PyObject *v;
1765 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001766 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001768 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001769 unsigned int base64bits = 0;
1770 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001771 char * out;
1772 char * start;
1773
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001774 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001775 return PyErr_NoMemory();
1776
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001777 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001778 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001779
Antoine Pitrou653dece2009-05-04 18:32:32 +00001780 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781 if (v == NULL)
1782 return NULL;
1783
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001784 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001785 for (;i < size; ++i) {
1786 Py_UNICODE ch = s[i];
1787
Antoine Pitrou653dece2009-05-04 18:32:32 +00001788 if (inShift) {
1789 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1790 /* shifting out */
1791 if (base64bits) { /* output remaining bits */
1792 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1793 base64buffer = 0;
1794 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001795 }
1796 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001797 /* Characters not in the BASE64 set implicitly unshift the sequence
1798 so no '-' is required, except if the character is itself a '-' */
1799 if (IS_BASE64(ch) || ch == '-') {
1800 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001801 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001802 *out++ = (char) ch;
1803 }
1804 else {
1805 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001806 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001807 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001808 else { /* not in a shift sequence */
1809 if (ch == '+') {
1810 *out++ = '+';
1811 *out++ = '-';
1812 }
1813 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1814 *out++ = (char) ch;
1815 }
1816 else {
1817 *out++ = '+';
1818 inShift = 1;
1819 goto encode_char;
1820 }
1821 }
1822 continue;
1823encode_char:
1824#ifdef Py_UNICODE_WIDE
1825 if (ch >= 0x10000) {
1826 /* code first surrogate */
1827 base64bits += 16;
1828 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1829 while (base64bits >= 6) {
1830 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1831 base64bits -= 6;
1832 }
1833 /* prepare second surrogate */
1834 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1835 }
1836#endif
1837 base64bits += 16;
1838 base64buffer = (base64buffer << 16) | ch;
1839 while (base64bits >= 6) {
1840 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1841 base64bits -= 6;
1842 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001843 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001844 if (base64bits)
1845 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1846 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001848
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001849 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001850 return v;
1851}
1852
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001858
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859/* --- UTF-8 Codec -------------------------------------------------------- */
1860
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details.
       Rows cover 16 byte values each, starting at 0x00.  The table is
       only ever read, so it is declared const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, never a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFF: four-, five- and six-byte prefixes; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1882
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001884 Py_ssize_t size,
1885 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886{
Walter Dörwald69652032004-09-07 20:24:22 +00001887 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1888}
1889
1890PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001891 Py_ssize_t size,
1892 const char *errors,
1893 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001894{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896 int n;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001897 Py_ssize_t startinpos;
1898 Py_ssize_t endinpos;
1899 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900 const char *e;
1901 PyUnicodeObject *unicode;
1902 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001903 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001904 PyObject *errorHandler = NULL;
1905 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906
1907 /* Note: size will always be longer than the resulting Unicode
1908 character count */
1909 unicode = _PyUnicode_New(size);
1910 if (!unicode)
1911 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001912 if (size == 0) {
1913 if (consumed)
1914 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001916 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917
1918 /* Unpack UTF-8 encoded data */
1919 p = unicode->str;
1920 e = s + size;
1921
1922 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001923 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924
1925 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001926 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927 s++;
1928 continue;
1929 }
1930
1931 n = utf8_code_length[ch];
1932
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001933 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001934 if (consumed)
1935 break;
1936 else {
1937 errmsg = "unexpected end of data";
1938 startinpos = s-starts;
1939 endinpos = size;
1940 goto utf8Error;
1941 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943
1944 switch (n) {
1945
1946 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001947 errmsg = "unexpected code byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001948 startinpos = s-starts;
1949 endinpos = startinpos+1;
1950 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951
1952 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001953 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 if ((s[1] & 0xc0) != 0x80) {
1960 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001961 startinpos = s-starts;
1962 endinpos = startinpos+2;
1963 goto utf8Error;
1964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001966 if (ch < 0x80) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001967 startinpos = s-starts;
1968 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001969 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001970 goto utf8Error;
1971 }
1972 else
1973 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 break;
1975
1976 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001977 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001978 (s[2] & 0xc0) != 0x80) {
1979 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001980 startinpos = s-starts;
1981 endinpos = startinpos+3;
1982 goto utf8Error;
1983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001985 if (ch < 0x0800) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001986 /* Note: UTF-8 encodings of surrogates are considered
1987 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001988
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001989 XXX For wide builds (UCS-4) we should probably try
1990 to recombine the surrogates into a single code
1991 unit.
1992 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001993 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001994 startinpos = s-starts;
1995 endinpos = startinpos+3;
1996 goto utf8Error;
1997 }
1998 else
1999 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002000 break;
2001
2002 case 4:
2003 if ((s[1] & 0xc0) != 0x80 ||
2004 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002005 (s[3] & 0xc0) != 0x80) {
2006 errmsg = "invalid data";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002007 startinpos = s-starts;
2008 endinpos = startinpos+4;
2009 goto utf8Error;
2010 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002011 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002012 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002013 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002014 if ((ch < 0x10000) /* minimum value allowed for 4
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002015 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002016 || (ch > 0x10ffff)) /* maximum value allowed for
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002017 UTF-16 */
2018 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002019 errmsg = "illegal encoding";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002020 startinpos = s-starts;
2021 endinpos = startinpos+4;
2022 goto utf8Error;
2023 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00002024#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002025 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002026#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002027 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002028
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002029 /* translate from 10000..10FFFF to 0..FFFF */
2030 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* high surrogate = top 10 bits added to D800 */
2033 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002034
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002035 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002036 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002037#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038 break;
2039
2040 default:
2041 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002042 errmsg = "unsupported Unicode code range";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002043 startinpos = s-starts;
2044 endinpos = startinpos+n;
2045 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 }
2047 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002048 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002049
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002050 utf8Error:
2051 outpos = p-PyUnicode_AS_UNICODE(unicode);
2052 if (unicode_decode_call_errorhandler(
2053 errors, &errorHandler,
2054 "utf8", errmsg,
2055 starts, size, &startinpos, &endinpos, &exc, &s,
2056 &unicode, &outpos, &p))
2057 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 }
Walter Dörwald69652032004-09-07 20:24:22 +00002059 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002060 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061
2062 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002063 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 goto onError;
2065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 Py_XDECREF(errorHandler);
2067 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 return (PyObject *)unicode;
2069
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002070 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002071 Py_XDECREF(errorHandler);
2072 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 Py_DECREF(unicode);
2074 return NULL;
2075}
2076
Tim Peters602f7402002-04-27 18:03:26 +00002077/* Allocation strategy: if the string is short, convert into a stack buffer
2078 and allocate exactly as much space needed at the end. Else allocate the
2079 maximum possible needed (4 result bytes per Unicode character), and return
2080 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002081*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002082PyObject *
2083PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002084 Py_ssize_t size,
2085 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086{
Tim Peters602f7402002-04-27 18:03:26 +00002087#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002088
Martin v. Löwis18e16552006-02-15 17:27:45 +00002089 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002090 PyObject *v; /* result string object */
2091 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002092 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002093 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002094 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002095
Tim Peters602f7402002-04-27 18:03:26 +00002096 assert(s != NULL);
2097 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098
Tim Peters602f7402002-04-27 18:03:26 +00002099 if (size <= MAX_SHORT_UNICHARS) {
2100 /* Write into the stack buffer; nallocated can't overflow.
2101 * At the end, we'll allocate exactly as much heap space as it
2102 * turns out we need.
2103 */
2104 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2105 v = NULL; /* will allocate after we're done */
2106 p = stackbuf;
2107 }
2108 else {
2109 /* Overallocate on the heap, and give the excess back at the end. */
2110 nallocated = size * 4;
2111 if (nallocated / 4 != size) /* overflow! */
2112 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002113 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002114 if (v == NULL)
2115 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002116 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002117 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002118
Tim Peters602f7402002-04-27 18:03:26 +00002119 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002120 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002122 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002123 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002125
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002127 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002128 *p++ = (char)(0xc0 | (ch >> 6));
2129 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002130 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002131 else {
Tim Peters602f7402002-04-27 18:03:26 +00002132 /* Encode UCS2 Unicode ordinals */
2133 if (ch < 0x10000) {
2134 /* Special case: check for high surrogate */
2135 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2136 Py_UCS4 ch2 = s[i];
2137 /* Check for low surrogate and combine the two to
2138 form a UCS4 value */
2139 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002140 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002141 i++;
2142 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002143 }
Tim Peters602f7402002-04-27 18:03:26 +00002144 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002145 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002146 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002147 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2148 *p++ = (char)(0x80 | (ch & 0x3f));
2149 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002150 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002151 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002152 /* Encode UCS4 Unicode ordinals */
2153 *p++ = (char)(0xf0 | (ch >> 18));
2154 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2155 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2156 *p++ = (char)(0x80 | (ch & 0x3f));
2157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002159
Tim Peters602f7402002-04-27 18:03:26 +00002160 if (v == NULL) {
2161 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002162 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002163 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002164 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002165 }
2166 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002167 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002168 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002169 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002170 _PyString_Resize(&v, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002173
Tim Peters602f7402002-04-27 18:03:26 +00002174#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175}
2176
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2178{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002179 if (!PyUnicode_Check(unicode)) {
2180 PyErr_BadArgument();
2181 return NULL;
2182 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002183 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002184 PyUnicode_GET_SIZE(unicode),
2185 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186}
2187
Walter Dörwald6e390802007-08-17 16:41:28 +00002188/* --- UTF-32 Codec ------------------------------------------------------- */
2189
2190PyObject *
2191PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002192 Py_ssize_t size,
2193 const char *errors,
2194 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002195{
2196 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2197}
2198
2199PyObject *
2200PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002201 Py_ssize_t size,
2202 const char *errors,
2203 int *byteorder,
2204 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002205{
2206 const char *starts = s;
2207 Py_ssize_t startinpos;
2208 Py_ssize_t endinpos;
2209 Py_ssize_t outpos;
2210 PyUnicodeObject *unicode;
2211 Py_UNICODE *p;
2212#ifndef Py_UNICODE_WIDE
2213 int i, pairs;
2214#else
2215 const int pairs = 0;
2216#endif
2217 const unsigned char *q, *e;
2218 int bo = 0; /* assume native ordering by default */
2219 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002220 /* Offsets from q for retrieving bytes in the right order. */
2221#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2222 int iorder[] = {0, 1, 2, 3};
2223#else
2224 int iorder[] = {3, 2, 1, 0};
2225#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002226 PyObject *errorHandler = NULL;
2227 PyObject *exc = NULL;
Walter Dörwald6e390802007-08-17 16:41:28 +00002228 /* On narrow builds we split characters outside the BMP into two
2229 codepoints => count how much extra space we need. */
2230#ifndef Py_UNICODE_WIDE
2231 for (i = pairs = 0; i < size/4; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002232 if (((Py_UCS4 *)s)[i] >= 0x10000)
2233 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002234#endif
Walter Dörwald6e390802007-08-17 16:41:28 +00002235
2236 /* This might be one to much, because of a BOM */
2237 unicode = _PyUnicode_New((size+3)/4+pairs);
2238 if (!unicode)
2239 return NULL;
2240 if (size == 0)
2241 return (PyObject *)unicode;
2242
2243 /* Unpack UTF-32 encoded data */
2244 p = unicode->str;
2245 q = (unsigned char *)s;
2246 e = q + size;
2247
2248 if (byteorder)
2249 bo = *byteorder;
2250
2251 /* Check for BOM marks (U+FEFF) in the input and adjust current
2252 byte order setting accordingly. In native mode, the leading BOM
2253 mark is skipped, in all other modes, it is copied to the output
2254 stream as-is (giving a ZWNBSP character). */
2255 if (bo == 0) {
2256 if (size >= 4) {
2257 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002258 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002259#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002260 if (bom == 0x0000FEFF) {
2261 q += 4;
2262 bo = -1;
2263 }
2264 else if (bom == 0xFFFE0000) {
2265 q += 4;
2266 bo = 1;
2267 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002268#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002269 if (bom == 0x0000FEFF) {
2270 q += 4;
2271 bo = 1;
2272 }
2273 else if (bom == 0xFFFE0000) {
2274 q += 4;
2275 bo = -1;
2276 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002277#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002278 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002279 }
2280
2281 if (bo == -1) {
2282 /* force LE */
2283 iorder[0] = 0;
2284 iorder[1] = 1;
2285 iorder[2] = 2;
2286 iorder[3] = 3;
2287 }
2288 else if (bo == 1) {
2289 /* force BE */
2290 iorder[0] = 3;
2291 iorder[1] = 2;
2292 iorder[2] = 1;
2293 iorder[3] = 0;
2294 }
2295
2296 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002297 Py_UCS4 ch;
2298 /* remaining bytes at the end? (size should be divisible by 4) */
2299 if (e-q<4) {
2300 if (consumed)
2301 break;
2302 errmsg = "truncated data";
2303 startinpos = ((const char *)q)-starts;
2304 endinpos = ((const char *)e)-starts;
2305 goto utf32Error;
2306 /* The remaining input chars are ignored if the callback
2307 chooses to skip the input */
2308 }
2309 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2310 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002311
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002312 if (ch >= 0x110000)
2313 {
2314 errmsg = "codepoint not in range(0x110000)";
2315 startinpos = ((const char *)q)-starts;
2316 endinpos = startinpos+4;
2317 goto utf32Error;
2318 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002319#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002320 if (ch >= 0x10000)
2321 {
2322 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2323 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2324 }
2325 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002326#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002327 *p++ = ch;
2328 q += 4;
2329 continue;
2330 utf32Error:
2331 outpos = p-PyUnicode_AS_UNICODE(unicode);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002335 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002336 &unicode, &outpos, &p))
2337 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002338 }
2339
2340 if (byteorder)
2341 *byteorder = bo;
2342
2343 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002344 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002345
2346 /* Adjust length */
2347 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2348 goto onError;
2349
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return (PyObject *)unicode;
2353
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002354 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002355 Py_DECREF(unicode);
2356 Py_XDECREF(errorHandler);
2357 Py_XDECREF(exc);
2358 return NULL;
2359}
2360
2361PyObject *
2362PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002363 Py_ssize_t size,
2364 const char *errors,
2365 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002366{
2367 PyObject *v;
2368 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002369 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002370#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002371 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002372#else
2373 const int pairs = 0;
2374#endif
2375 /* Offsets from p for storing byte pairs in the right order. */
2376#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2377 int iorder[] = {0, 1, 2, 3};
2378#else
2379 int iorder[] = {3, 2, 1, 0};
2380#endif
2381
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002382#define STORECHAR(CH) \
2383 do { \
2384 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2385 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2386 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2387 p[iorder[0]] = (CH) & 0xff; \
2388 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002389 } while(0)
2390
2391 /* In narrow builds we can output surrogate pairs as one codepoint,
2392 so we need less space. */
2393#ifndef Py_UNICODE_WIDE
2394 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002395 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2396 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2397 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002398#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002399 nsize = (size - pairs + (byteorder == 0));
2400 bytesize = nsize * 4;
2401 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002402 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002403 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002404 if (v == NULL)
2405 return NULL;
2406
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002407 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002408 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002409 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002410 if (size == 0)
2411 return v;
2412
2413 if (byteorder == -1) {
2414 /* force LE */
2415 iorder[0] = 0;
2416 iorder[1] = 1;
2417 iorder[2] = 2;
2418 iorder[3] = 3;
2419 }
2420 else if (byteorder == 1) {
2421 /* force BE */
2422 iorder[0] = 3;
2423 iorder[1] = 2;
2424 iorder[2] = 1;
2425 iorder[3] = 0;
2426 }
2427
2428 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002429 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002430#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002431 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2432 Py_UCS4 ch2 = *s;
2433 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2434 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2435 s++;
2436 size--;
2437 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002438 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002439#endif
2440 STORECHAR(ch);
2441 }
2442 return v;
2443#undef STORECHAR
2444}
2445
2446PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2447{
2448 if (!PyUnicode_Check(unicode)) {
2449 PyErr_BadArgument();
2450 return NULL;
2451 }
2452 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002453 PyUnicode_GET_SIZE(unicode),
2454 NULL,
2455 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002456}
2457
Guido van Rossumd57fd912000-03-10 22:53:23 +00002458/* --- UTF-16 Codec ------------------------------------------------------- */
2459
Tim Peters772747b2001-08-09 22:21:55 +00002460PyObject *
2461PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002462 Py_ssize_t size,
2463 const char *errors,
2464 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465{
Walter Dörwald69652032004-09-07 20:24:22 +00002466 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2467}
2468
2469PyObject *
2470PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002471 Py_ssize_t size,
2472 const char *errors,
2473 int *byteorder,
2474 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002475{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002476 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002477 Py_ssize_t startinpos;
2478 Py_ssize_t endinpos;
2479 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 PyUnicodeObject *unicode;
2481 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002482 const unsigned char *q, *e;
2483 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002484 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002485 /* Offsets from q for retrieving byte pairs in the right order. */
2486#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2487 int ihi = 1, ilo = 0;
2488#else
2489 int ihi = 0, ilo = 1;
2490#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 PyObject *errorHandler = NULL;
2492 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493
2494 /* Note: size will always be longer than the resulting Unicode
2495 character count */
2496 unicode = _PyUnicode_New(size);
2497 if (!unicode)
2498 return NULL;
2499 if (size == 0)
2500 return (PyObject *)unicode;
2501
2502 /* Unpack UTF-16 encoded data */
2503 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002504 q = (unsigned char *)s;
2505 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
2507 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002508 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002510 /* Check for BOM marks (U+FEFF) in the input and adjust current
2511 byte order setting accordingly. In native mode, the leading BOM
2512 mark is skipped, in all other modes, it is copied to the output
2513 stream as-is (giving a ZWNBSP character). */
2514 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002515 if (size >= 2) {
2516 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002517#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002518 if (bom == 0xFEFF) {
2519 q += 2;
2520 bo = -1;
2521 }
2522 else if (bom == 0xFFFE) {
2523 q += 2;
2524 bo = 1;
2525 }
Tim Petersced69f82003-09-16 20:30:58 +00002526#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002527 if (bom == 0xFEFF) {
2528 q += 2;
2529 bo = 1;
2530 }
2531 else if (bom == 0xFFFE) {
2532 q += 2;
2533 bo = -1;
2534 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002535#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002536 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538
Tim Peters772747b2001-08-09 22:21:55 +00002539 if (bo == -1) {
2540 /* force LE */
2541 ihi = 1;
2542 ilo = 0;
2543 }
2544 else if (bo == 1) {
2545 /* force BE */
2546 ihi = 0;
2547 ilo = 1;
2548 }
2549
2550 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002551 Py_UNICODE ch;
2552 /* remaining bytes at the end? (size should be even) */
2553 if (e-q<2) {
2554 if (consumed)
2555 break;
2556 errmsg = "truncated data";
2557 startinpos = ((const char *)q)-starts;
2558 endinpos = ((const char *)e)-starts;
2559 goto utf16Error;
2560 /* The remaining input chars are ignored if the callback
2561 chooses to skip the input */
2562 }
2563 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002564
Benjamin Peterson857ce152009-01-31 16:29:18 +00002565 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002566
2567 if (ch < 0xD800 || ch > 0xDFFF) {
2568 *p++ = ch;
2569 continue;
2570 }
2571
2572 /* UTF-16 code pair: */
2573 if (q >= e) {
2574 errmsg = "unexpected end of data";
2575 startinpos = (((const char *)q)-2)-starts;
2576 endinpos = ((const char *)e)-starts;
2577 goto utf16Error;
2578 }
2579 if (0xD800 <= ch && ch <= 0xDBFF) {
2580 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2581 q += 2;
2582 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002583#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002584 *p++ = ch;
2585 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002586#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002587 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002588#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002589 continue;
2590 }
2591 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002592 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002593 startinpos = (((const char *)q)-4)-starts;
2594 endinpos = startinpos+2;
2595 goto utf16Error;
2596 }
2597
Benjamin Peterson857ce152009-01-31 16:29:18 +00002598 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002599 errmsg = "illegal encoding";
2600 startinpos = (((const char *)q)-2)-starts;
2601 endinpos = startinpos+2;
2602 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002603
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002604 utf16Error:
2605 outpos = p-PyUnicode_AS_UNICODE(unicode);
2606 if (unicode_decode_call_errorhandler(
2607 errors, &errorHandler,
2608 "utf16", errmsg,
2609 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2610 &unicode, &outpos, &p))
2611 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 }
2613
2614 if (byteorder)
2615 *byteorder = bo;
2616
Walter Dörwald69652032004-09-07 20:24:22 +00002617 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002618 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002619
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002621 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622 goto onError;
2623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 Py_XDECREF(errorHandler);
2625 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 return (PyObject *)unicode;
2627
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002628 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002630 Py_XDECREF(errorHandler);
2631 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002632 return NULL;
2633}
2634
Tim Peters772747b2001-08-09 22:21:55 +00002635PyObject *
2636PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002637 Py_ssize_t size,
2638 const char *errors,
2639 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640{
2641 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002642 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002643 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002644#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002645 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002646#else
2647 const int pairs = 0;
2648#endif
Tim Peters772747b2001-08-09 22:21:55 +00002649 /* Offsets from p for storing byte pairs in the right order. */
2650#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2651 int ihi = 1, ilo = 0;
2652#else
2653 int ihi = 0, ilo = 1;
2654#endif
2655
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002656#define STORECHAR(CH) \
2657 do { \
2658 p[ihi] = ((CH) >> 8) & 0xff; \
2659 p[ilo] = (CH) & 0xff; \
2660 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002661 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002663#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002664 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002665 if (s[i] >= 0x10000)
2666 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002667#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002668 /* 2 * (size + pairs + (byteorder == 0)) */
2669 if (size > PY_SSIZE_T_MAX ||
2670 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002671 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002672 nsize = size + pairs + (byteorder == 0);
2673 bytesize = nsize * 2;
2674 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002675 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002676 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 if (v == NULL)
2678 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002680 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002682 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002683 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002684 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002685
2686 if (byteorder == -1) {
2687 /* force LE */
2688 ihi = 1;
2689 ilo = 0;
2690 }
2691 else if (byteorder == 1) {
2692 /* force BE */
2693 ihi = 0;
2694 ilo = 1;
2695 }
2696
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002697 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002698 Py_UNICODE ch = *s++;
2699 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002700#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002701 if (ch >= 0x10000) {
2702 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2703 ch = 0xD800 | ((ch-0x10000) >> 10);
2704 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002705#endif
Tim Peters772747b2001-08-09 22:21:55 +00002706 STORECHAR(ch);
2707 if (ch2)
2708 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002711#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712}
2713
2714PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2715{
2716 if (!PyUnicode_Check(unicode)) {
2717 PyErr_BadArgument();
2718 return NULL;
2719 }
2720 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002721 PyUnicode_GET_SIZE(unicode),
2722 NULL,
2723 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724}
2725
2726/* --- Unicode Escape Codec ----------------------------------------------- */
2727
Fredrik Lundh06d12682001-01-24 07:59:11 +00002728static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002729
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002731 Py_ssize_t size,
2732 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002735 Py_ssize_t startinpos;
2736 Py_ssize_t endinpos;
2737 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002740 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002742 char* message;
2743 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 PyObject *errorHandler = NULL;
2745 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002746
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 /* Escaped strings will always be longer than the resulting
2748 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002749 length after conversion to the true value.
2750 (but if the error callback returns a long replacement string
2751 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 v = _PyUnicode_New(size);
2753 if (v == NULL)
2754 goto onError;
2755 if (size == 0)
2756 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002757
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002760
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 while (s < end) {
2762 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002763 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002764 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765
2766 /* Non-escape characters are interpreted as Unicode ordinals */
2767 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002768 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 continue;
2770 }
2771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 /* \ - Escapes */
2774 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002775 c = *s++;
2776 if (s > end)
2777 c = '\0'; /* Invalid after \ */
2778 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002780 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 case '\n': break;
2782 case '\\': *p++ = '\\'; break;
2783 case '\'': *p++ = '\''; break;
2784 case '\"': *p++ = '\"'; break;
2785 case 'b': *p++ = '\b'; break;
2786 case 'f': *p++ = '\014'; break; /* FF */
2787 case 't': *p++ = '\t'; break;
2788 case 'n': *p++ = '\n'; break;
2789 case 'r': *p++ = '\r'; break;
2790 case 'v': *p++ = '\013'; break; /* VT */
2791 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2792
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002793 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 case '0': case '1': case '2': case '3':
2795 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002796 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002797 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002798 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002799 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002800 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002802 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 break;
2804
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002805 /* hex escapes */
2806 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002808 digits = 2;
2809 message = "truncated \\xXX escape";
2810 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002812 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002814 digits = 4;
2815 message = "truncated \\uXXXX escape";
2816 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002818 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002819 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002820 digits = 8;
2821 message = "truncated \\UXXXXXXXX escape";
2822 hexescape:
2823 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002824 outpos = p-PyUnicode_AS_UNICODE(v);
2825 if (s+digits>end) {
2826 endinpos = size;
2827 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002828 errors, &errorHandler,
2829 "unicodeescape", "end of string in escape sequence",
2830 starts, size, &startinpos, &endinpos, &exc, &s,
2831 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002832 goto onError;
2833 goto nextByte;
2834 }
2835 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002836 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002837 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002838 endinpos = (s+i+1)-starts;
2839 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002840 errors, &errorHandler,
2841 "unicodeescape", message,
2842 starts, size, &startinpos, &endinpos, &exc, &s,
2843 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002845 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002846 }
2847 chr = (chr<<4) & ~0xF;
2848 if (c >= '0' && c <= '9')
2849 chr += c - '0';
2850 else if (c >= 'a' && c <= 'f')
2851 chr += 10 + c - 'a';
2852 else
2853 chr += 10 + c - 'A';
2854 }
2855 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002856 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 /* _decoding_error will have already written into the
2858 target buffer. */
2859 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002860 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002861 /* when we get here, chr is a 32-bit unicode character */
2862 if (chr <= 0xffff)
2863 /* UCS-2 character */
2864 *p++ = (Py_UNICODE) chr;
2865 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002866 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002867 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002868#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002869 *p++ = chr;
2870#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002871 chr -= 0x10000L;
2872 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002873 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002874#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002875 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002876 endinpos = s-starts;
2877 outpos = p-PyUnicode_AS_UNICODE(v);
2878 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002879 errors, &errorHandler,
2880 "unicodeescape", "illegal Unicode character",
2881 starts, size, &startinpos, &endinpos, &exc, &s,
2882 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002883 goto onError;
2884 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002885 break;
2886
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002887 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002888 case 'N':
2889 message = "malformed \\N character escape";
2890 if (ucnhash_CAPI == NULL) {
2891 /* load the unicode data module */
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002892 PyObject *m, *api;
Christian Heimes000a0742008-01-03 22:16:32 +00002893 m = PyImport_ImportModuleNoBlock("unicodedata");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002894 if (m == NULL)
2895 goto ucnhashError;
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002896 api = PyObject_GetAttrString(m, "ucnhash_CAPI");
Fredrik Lundhccc74732001-02-18 22:13:49 +00002897 Py_DECREF(m);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002898 if (api == NULL)
Fredrik Lundhccc74732001-02-18 22:13:49 +00002899 goto ucnhashError;
Anthony Baxtera6286212006-04-11 07:42:36 +00002900 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCObject_AsVoidPtr(api);
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002901 Py_DECREF(api);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002902 if (ucnhash_CAPI == NULL)
2903 goto ucnhashError;
2904 }
2905 if (*s == '{') {
2906 const char *start = s+1;
2907 /* look for the closing brace */
2908 while (*s != '}' && s < end)
2909 s++;
2910 if (s > start && s < end && *s == '}') {
2911 /* found a name. look it up in the unicode database */
2912 message = "unknown Unicode character name";
2913 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002914 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002915 goto store;
2916 }
2917 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 endinpos = s-starts;
2919 outpos = p-PyUnicode_AS_UNICODE(v);
2920 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002921 errors, &errorHandler,
2922 "unicodeescape", message,
2923 starts, size, &startinpos, &endinpos, &exc, &s,
2924 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002925 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002926 break;
2927
2928 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002929 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002930 message = "\\ at end of string";
2931 s--;
2932 endinpos = s-starts;
2933 outpos = p-PyUnicode_AS_UNICODE(v);
2934 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002935 errors, &errorHandler,
2936 "unicodeescape", message,
2937 starts, size, &startinpos, &endinpos, &exc, &s,
2938 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002939 goto onError;
2940 }
2941 else {
2942 *p++ = '\\';
2943 *p++ = (unsigned char)s[-1];
2944 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002945 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002947 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002948 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002950 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002952 Py_XDECREF(errorHandler);
2953 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002955
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002956 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002957 PyErr_SetString(
2958 PyExc_UnicodeError,
2959 "\\N escapes not supported (can't load unicodedata module)"
2960 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002961 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002962 Py_XDECREF(errorHandler);
2963 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002964 return NULL;
2965
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002966 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002968 Py_XDECREF(errorHandler);
2969 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 return NULL;
2971}
2972
2973/* Return a Unicode-Escape string version of the Unicode object.
2974
2975 If quotes is true, the string is enclosed in u"" or u'' quotes as
2976 appropriate.
2977
2978*/
2979
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002980Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002981 Py_ssize_t size,
2982 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002983{
2984 /* like wcschr, but doesn't stop at NULL characters */
2985
2986 while (size-- > 0) {
2987 if (*s == ch)
2988 return s;
2989 s++;
2990 }
2991
2992 return NULL;
2993}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002994
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995static
2996PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002997 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 int quotes)
2999{
3000 PyObject *repr;
3001 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003002
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003003 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00003004#ifdef Py_UNICODE_WIDE
3005 const Py_ssize_t expandsize = 10;
3006#else
3007 const Py_ssize_t expandsize = 6;
3008#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009
Neal Norwitz17753ec2006-08-21 22:21:19 +00003010 /* XXX(nnorwitz): rather than over-allocating, it would be
3011 better to choose a different scheme. Perhaps scan the
3012 first N-chars of the string and allocate based on that size.
3013 */
3014 /* Initial allocation is based on the longest-possible unichr
3015 escape.
3016
3017 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3018 unichr, so in this case it's the longest unichr escape. In
3019 narrow (UTF-16) builds this is five chars per source unichr
3020 since there are two unichrs in the surrogate pair, so in narrow
3021 (UTF-16) builds it's not the longest unichr escape.
3022
3023 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3024 so in the narrow (UTF-16) build case it's the longest unichr
3025 escape.
3026 */
3027
Neal Norwitze7d8be82008-07-31 17:17:14 +00003028 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003029 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003030
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003031 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003032 2
3033 + expandsize*size
3034 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 if (repr == NULL)
3036 return NULL;
3037
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003038 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039
3040 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003042 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043 !findchar(s, size, '"')) ? '"' : '\'';
3044 }
3045 while (size-- > 0) {
3046 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003047
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003048 /* Escape quotes and backslashes */
3049 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003050 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 *p++ = '\\';
3052 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003053 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003054 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003055
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003056#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003057 /* Map 21-bit characters to '\U00xxxxxx' */
3058 else if (ch >= 0x10000) {
3059 *p++ = '\\';
3060 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003061 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3062 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3063 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3064 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3065 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3066 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3067 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003068 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003069 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003070 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003071#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003072 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3073 else if (ch >= 0xD800 && ch < 0xDC00) {
3074 Py_UNICODE ch2;
3075 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003076
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003077 ch2 = *s++;
3078 size--;
3079 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3080 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3081 *p++ = '\\';
3082 *p++ = 'U';
3083 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3084 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3085 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3086 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3087 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3088 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3089 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3090 *p++ = hexdigit[ucs & 0x0000000F];
3091 continue;
3092 }
3093 /* Fall through: isolated surrogates are copied as-is */
3094 s--;
3095 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003096 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003097#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003098
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003100 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 *p++ = '\\';
3102 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003103 *p++ = hexdigit[(ch >> 12) & 0x000F];
3104 *p++ = hexdigit[(ch >> 8) & 0x000F];
3105 *p++ = hexdigit[(ch >> 4) & 0x000F];
3106 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003108
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003109 /* Map special whitespace to '\t', \n', '\r' */
3110 else if (ch == '\t') {
3111 *p++ = '\\';
3112 *p++ = 't';
3113 }
3114 else if (ch == '\n') {
3115 *p++ = '\\';
3116 *p++ = 'n';
3117 }
3118 else if (ch == '\r') {
3119 *p++ = '\\';
3120 *p++ = 'r';
3121 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003122
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003123 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003124 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003126 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003127 *p++ = hexdigit[(ch >> 4) & 0x000F];
3128 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003129 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003130
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 /* Copy everything else as-is */
3132 else
3133 *p++ = (char) ch;
3134 }
3135 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003136 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003137
3138 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003139 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 return repr;
3141}
3142
3143PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003144 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145{
3146 return unicodeescape_string(s, size, 0);
3147}
3148
3149PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3150{
3151 if (!PyUnicode_Check(unicode)) {
3152 PyErr_BadArgument();
3153 return NULL;
3154 }
3155 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003156 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157}
3158
3159/* --- Raw Unicode Escape Codec ------------------------------------------- */
3160
3161PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003162 Py_ssize_t size,
3163 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003165 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003166 Py_ssize_t startinpos;
3167 Py_ssize_t endinpos;
3168 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003170 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 const char *end;
3172 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003173 PyObject *errorHandler = NULL;
3174 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003175
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 /* Escaped strings will always be longer than the resulting
3177 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003178 length after conversion to the true value. (But decoding error
3179 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180 v = _PyUnicode_New(size);
3181 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003182 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003184 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186 end = s + size;
3187 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003188 unsigned char c;
3189 Py_UCS4 x;
3190 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003191 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003193 /* Non-escape characters are interpreted as Unicode ordinals */
3194 if (*s != '\\') {
3195 *p++ = (unsigned char)*s++;
3196 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003197 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003198 startinpos = s-starts;
3199
3200 /* \u-escapes are only interpreted iff the number of leading
3201 backslashes if odd */
3202 bs = s;
3203 for (;s < end;) {
3204 if (*s != '\\')
3205 break;
3206 *p++ = (unsigned char)*s++;
3207 }
3208 if (((s - bs) & 1) == 0 ||
3209 s >= end ||
3210 (*s != 'u' && *s != 'U')) {
3211 continue;
3212 }
3213 p--;
3214 count = *s=='u' ? 4 : 8;
3215 s++;
3216
3217 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3218 outpos = p-PyUnicode_AS_UNICODE(v);
3219 for (x = 0, i = 0; i < count; ++i, ++s) {
3220 c = (unsigned char)*s;
3221 if (!isxdigit(c)) {
3222 endinpos = s-starts;
3223 if (unicode_decode_call_errorhandler(
3224 errors, &errorHandler,
3225 "rawunicodeescape", "truncated \\uXXXX",
3226 starts, size, &startinpos, &endinpos, &exc, &s,
3227 &v, &outpos, &p))
3228 goto onError;
3229 goto nextByte;
3230 }
3231 x = (x<<4) & ~0xF;
3232 if (c >= '0' && c <= '9')
3233 x += c - '0';
3234 else if (c >= 'a' && c <= 'f')
3235 x += 10 + c - 'a';
3236 else
3237 x += 10 + c - 'A';
3238 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003239 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003240 /* UCS-2 character */
3241 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003242 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003243 /* UCS-4 character. Either store directly, or as
3244 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003245#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003246 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003247#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003248 x -= 0x10000L;
3249 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3250 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003251#endif
3252 } else {
3253 endinpos = s-starts;
3254 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003255 if (unicode_decode_call_errorhandler(
3256 errors, &errorHandler,
3257 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003258 starts, size, &startinpos, &endinpos, &exc, &s,
3259 &v, &outpos, &p))
3260 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003261 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003262 nextByte:
3263 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003265 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003266 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003267 Py_XDECREF(errorHandler);
3268 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003270
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003271 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003273 Py_XDECREF(errorHandler);
3274 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003275 return NULL;
3276}
3277
3278PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003279 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280{
3281 PyObject *repr;
3282 char *p;
3283 char *q;
3284
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003285 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003286#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003287 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003288#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003289 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003290#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003291
Neal Norwitze7d8be82008-07-31 17:17:14 +00003292 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003293 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003294
Neal Norwitze7d8be82008-07-31 17:17:14 +00003295 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003296 if (repr == NULL)
3297 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003298 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003299 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003301 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 while (size-- > 0) {
3303 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003304#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003305 /* Map 32-bit characters to '\Uxxxxxxxx' */
3306 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003307 *p++ = '\\';
3308 *p++ = 'U';
3309 *p++ = hexdigit[(ch >> 28) & 0xf];
3310 *p++ = hexdigit[(ch >> 24) & 0xf];
3311 *p++ = hexdigit[(ch >> 20) & 0xf];
3312 *p++ = hexdigit[(ch >> 16) & 0xf];
3313 *p++ = hexdigit[(ch >> 12) & 0xf];
3314 *p++ = hexdigit[(ch >> 8) & 0xf];
3315 *p++ = hexdigit[(ch >> 4) & 0xf];
3316 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003317 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003318 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003319#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003320 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3321 if (ch >= 0xD800 && ch < 0xDC00) {
3322 Py_UNICODE ch2;
3323 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003324
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003325 ch2 = *s++;
3326 size--;
3327 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3328 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3329 *p++ = '\\';
3330 *p++ = 'U';
3331 *p++ = hexdigit[(ucs >> 28) & 0xf];
3332 *p++ = hexdigit[(ucs >> 24) & 0xf];
3333 *p++ = hexdigit[(ucs >> 20) & 0xf];
3334 *p++ = hexdigit[(ucs >> 16) & 0xf];
3335 *p++ = hexdigit[(ucs >> 12) & 0xf];
3336 *p++ = hexdigit[(ucs >> 8) & 0xf];
3337 *p++ = hexdigit[(ucs >> 4) & 0xf];
3338 *p++ = hexdigit[ucs & 0xf];
3339 continue;
3340 }
3341 /* Fall through: isolated surrogates are copied as-is */
3342 s--;
3343 size++;
3344 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003345#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003346 /* Map 16-bit characters to '\uxxxx' */
3347 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 *p++ = '\\';
3349 *p++ = 'u';
3350 *p++ = hexdigit[(ch >> 12) & 0xf];
3351 *p++ = hexdigit[(ch >> 8) & 0xf];
3352 *p++ = hexdigit[(ch >> 4) & 0xf];
3353 *p++ = hexdigit[ch & 15];
3354 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003355 /* Copy everything else as-is */
3356 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 *p++ = (char) ch;
3358 }
3359 *p = '\0';
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003360 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 return repr;
3362}
3363
3364PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3365{
3366 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003367 PyErr_BadArgument();
3368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 }
3370 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003371 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372}
3373
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003374/* --- Unicode Internal Codec ------------------------------------------- */
3375
3376PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003377 Py_ssize_t size,
3378 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003379{
3380 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003381 Py_ssize_t startinpos;
3382 Py_ssize_t endinpos;
3383 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003384 PyUnicodeObject *v;
3385 Py_UNICODE *p;
3386 const char *end;
3387 const char *reason;
3388 PyObject *errorHandler = NULL;
3389 PyObject *exc = NULL;
3390
Neal Norwitzd43069c2006-01-08 01:12:10 +00003391#ifdef Py_UNICODE_WIDE
3392 Py_UNICODE unimax = PyUnicode_GetMax();
3393#endif
3394
Armin Rigo7ccbca92006-10-04 12:17:45 +00003395 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003396 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3397 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003398 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003399 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003400 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003401 p = PyUnicode_AS_UNICODE(v);
3402 end = s + size;
3403
3404 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003405 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003406 /* We have to sanity check the raw data, otherwise doom looms for
3407 some malformed UCS-4 data. */
3408 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003409#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003410 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003411#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003412 end-s < Py_UNICODE_SIZE
3413 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003414 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003415 startinpos = s - starts;
3416 if (end-s < Py_UNICODE_SIZE) {
3417 endinpos = end-starts;
3418 reason = "truncated input";
3419 }
3420 else {
3421 endinpos = s - starts + Py_UNICODE_SIZE;
3422 reason = "illegal code point (> 0x10FFFF)";
3423 }
3424 outpos = p - PyUnicode_AS_UNICODE(v);
3425 if (unicode_decode_call_errorhandler(
3426 errors, &errorHandler,
3427 "unicode_internal", reason,
3428 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003429 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003430 goto onError;
3431 }
3432 }
3433 else {
3434 p++;
3435 s += Py_UNICODE_SIZE;
3436 }
3437 }
3438
Martin v. Löwis412fb672006-04-13 06:34:32 +00003439 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003440 goto onError;
3441 Py_XDECREF(errorHandler);
3442 Py_XDECREF(exc);
3443 return (PyObject *)v;
3444
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003445 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003446 Py_XDECREF(v);
3447 Py_XDECREF(errorHandler);
3448 Py_XDECREF(exc);
3449 return NULL;
3450}
3451
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452/* --- Latin-1 Codec ------------------------------------------------------ */
3453
3454PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003455 Py_ssize_t size,
3456 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457{
3458 PyUnicodeObject *v;
3459 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003460
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003462 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003463 Py_UNICODE r = *(unsigned char*)s;
3464 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003465 }
3466
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 v = _PyUnicode_New(size);
3468 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003469 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003471 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472 p = PyUnicode_AS_UNICODE(v);
3473 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003474 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003476
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003477 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 Py_XDECREF(v);
3479 return NULL;
3480}
3481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482/* create or adjust a UnicodeEncodeError */
3483static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003484 const char *encoding,
3485 const Py_UNICODE *unicode, Py_ssize_t size,
3486 Py_ssize_t startpos, Py_ssize_t endpos,
3487 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003489 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003490 *exceptionObject = PyUnicodeEncodeError_Create(
3491 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492 }
3493 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003494 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3495 goto onError;
3496 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3497 goto onError;
3498 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3499 goto onError;
3500 return;
3501 onError:
3502 Py_DECREF(*exceptionObject);
3503 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 }
3505}
3506
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507/* raises a UnicodeEncodeError */
3508static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003509 const char *encoding,
3510 const Py_UNICODE *unicode, Py_ssize_t size,
3511 Py_ssize_t startpos, Py_ssize_t endpos,
3512 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513{
3514 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003515 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003516 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003517 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003518}
3519
3520/* error handling callback helper:
3521 build arguments, call the callback and check the arguments,
3522 put the result into newpos and return the replacement string, which
3523 has to be freed by the caller */
3524static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003525 PyObject **errorHandler,
3526 const char *encoding, const char *reason,
3527 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3528 Py_ssize_t startpos, Py_ssize_t endpos,
3529 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003531 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003532
3533 PyObject *restuple;
3534 PyObject *resunicode;
3535
3536 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003537 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003539 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 }
3541
3542 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003543 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003545 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546
3547 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003548 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003549 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003550 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003552 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003553 Py_DECREF(restuple);
3554 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 }
3556 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003557 &resunicode, newpos)) {
3558 Py_DECREF(restuple);
3559 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 }
3561 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003562 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003563 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003564 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3565 Py_DECREF(restuple);
3566 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003567 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 Py_INCREF(resunicode);
3569 Py_DECREF(restuple);
3570 return resunicode;
3571}
3572
3573static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003574 Py_ssize_t size,
3575 const char *errors,
3576 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577{
3578 /* output object */
3579 PyObject *res;
3580 /* pointers to the beginning and end+1 of input */
3581 const Py_UNICODE *startp = p;
3582 const Py_UNICODE *endp = p + size;
3583 /* pointer to the beginning of the unencodable characters */
3584 /* const Py_UNICODE *badp = NULL; */
3585 /* pointer into the output */
3586 char *str;
3587 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003588 Py_ssize_t respos = 0;
3589 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003590 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3591 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 PyObject *errorHandler = NULL;
3593 PyObject *exc = NULL;
3594 /* the following variable is used for caching string comparisons
3595 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3596 int known_errorHandler = -1;
3597
3598 /* allocate enough for a simple encoding without
3599 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003600 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 if (res == NULL)
3602 goto onError;
3603 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003604 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003605 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 ressize = size;
3607
3608 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003609 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003610
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003611 /* can we encode this? */
3612 if (c<limit) {
3613 /* no overflow check, because we know that the space is enough */
3614 *str++ = (char)c;
3615 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003616 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003617 else {
3618 Py_ssize_t unicodepos = p-startp;
3619 Py_ssize_t requiredsize;
3620 PyObject *repunicode;
3621 Py_ssize_t repsize;
3622 Py_ssize_t newpos;
3623 Py_ssize_t respos;
3624 Py_UNICODE *uni2;
3625 /* startpos for collecting unencodable chars */
3626 const Py_UNICODE *collstart = p;
3627 const Py_UNICODE *collend = p;
3628 /* find all unecodable characters */
3629 while ((collend < endp) && ((*collend)>=limit))
3630 ++collend;
3631 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3632 if (known_errorHandler==-1) {
3633 if ((errors==NULL) || (!strcmp(errors, "strict")))
3634 known_errorHandler = 1;
3635 else if (!strcmp(errors, "replace"))
3636 known_errorHandler = 2;
3637 else if (!strcmp(errors, "ignore"))
3638 known_errorHandler = 3;
3639 else if (!strcmp(errors, "xmlcharrefreplace"))
3640 known_errorHandler = 4;
3641 else
3642 known_errorHandler = 0;
3643 }
3644 switch (known_errorHandler) {
3645 case 1: /* strict */
3646 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3647 goto onError;
3648 case 2: /* replace */
3649 while (collstart++<collend)
3650 *str++ = '?'; /* fall through */
3651 case 3: /* ignore */
3652 p = collend;
3653 break;
3654 case 4: /* xmlcharrefreplace */
3655 respos = str-PyString_AS_STRING(res);
3656 /* determine replacement size (temporarily (mis)uses p) */
3657 for (p = collstart, repsize = 0; p < collend; ++p) {
3658 if (*p<10)
3659 repsize += 2+1+1;
3660 else if (*p<100)
3661 repsize += 2+2+1;
3662 else if (*p<1000)
3663 repsize += 2+3+1;
3664 else if (*p<10000)
3665 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003666#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003667 else
3668 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003669#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003670 else if (*p<100000)
3671 repsize += 2+5+1;
3672 else if (*p<1000000)
3673 repsize += 2+6+1;
3674 else
3675 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003676#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003677 }
3678 requiredsize = respos+repsize+(endp-collend);
3679 if (requiredsize > ressize) {
3680 if (requiredsize<2*ressize)
3681 requiredsize = 2*ressize;
3682 if (_PyString_Resize(&res, requiredsize))
3683 goto onError;
3684 str = PyString_AS_STRING(res) + respos;
3685 ressize = requiredsize;
3686 }
3687 /* generate replacement (temporarily (mis)uses p) */
3688 for (p = collstart; p < collend; ++p) {
3689 str += sprintf(str, "&#%d;", (int)*p);
3690 }
3691 p = collend;
3692 break;
3693 default:
3694 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3695 encoding, reason, startp, size, &exc,
3696 collstart-startp, collend-startp, &newpos);
3697 if (repunicode == NULL)
3698 goto onError;
3699 /* need more space? (at least enough for what we have+the
3700 replacement+the rest of the string, so we won't have to
3701 check space for encodable characters) */
3702 respos = str-PyString_AS_STRING(res);
3703 repsize = PyUnicode_GET_SIZE(repunicode);
3704 requiredsize = respos+repsize+(endp-collend);
3705 if (requiredsize > ressize) {
3706 if (requiredsize<2*ressize)
3707 requiredsize = 2*ressize;
3708 if (_PyString_Resize(&res, requiredsize)) {
3709 Py_DECREF(repunicode);
3710 goto onError;
3711 }
3712 str = PyString_AS_STRING(res) + respos;
3713 ressize = requiredsize;
3714 }
3715 /* check if there is anything unencodable in the replacement
3716 and copy it to the output */
3717 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3718 c = *uni2;
3719 if (c >= limit) {
3720 raise_encode_exception(&exc, encoding, startp, size,
3721 unicodepos, unicodepos+1, reason);
3722 Py_DECREF(repunicode);
3723 goto onError;
3724 }
3725 *str = (char)c;
3726 }
3727 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003728 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003729 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003730 }
3731 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003733 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003735 /* If this falls res will be NULL */
3736 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 Py_XDECREF(errorHandler);
3738 Py_XDECREF(exc);
3739 return res;
3740
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003741 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 Py_XDECREF(res);
3743 Py_XDECREF(errorHandler);
3744 Py_XDECREF(exc);
3745 return NULL;
3746}
3747
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003749 Py_ssize_t size,
3750 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753}
3754
3755PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3756{
3757 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003758 PyErr_BadArgument();
3759 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 }
3761 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003762 PyUnicode_GET_SIZE(unicode),
3763 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764}
3765
3766/* --- 7-bit ASCII Codec -------------------------------------------------- */
3767
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003769 Py_ssize_t size,
3770 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 PyUnicodeObject *v;
3774 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003775 Py_ssize_t startinpos;
3776 Py_ssize_t endinpos;
3777 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 const char *e;
3779 PyObject *errorHandler = NULL;
3780 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003781
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003783 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003784 Py_UNICODE r = *(unsigned char*)s;
3785 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003786 }
Tim Petersced69f82003-09-16 20:30:58 +00003787
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 v = _PyUnicode_New(size);
3789 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003790 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003792 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003794 e = s + size;
3795 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003796 register unsigned char c = (unsigned char)*s;
3797 if (c < 128) {
3798 *p++ = c;
3799 ++s;
3800 }
3801 else {
3802 startinpos = s-starts;
3803 endinpos = startinpos + 1;
3804 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3805 if (unicode_decode_call_errorhandler(
3806 errors, &errorHandler,
3807 "ascii", "ordinal not in range(128)",
3808 starts, size, &startinpos, &endinpos, &exc, &s,
3809 &v, &outpos, &p))
3810 goto onError;
3811 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003813 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003814 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3815 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 Py_XDECREF(errorHandler);
3817 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003819
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003820 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 Py_XDECREF(errorHandler);
3823 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 return NULL;
3825}
3826
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003828 Py_ssize_t size,
3829 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003831 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832}
3833
3834PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3835{
3836 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003837 PyErr_BadArgument();
3838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 }
3840 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003841 PyUnicode_GET_SIZE(unicode),
3842 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843}
3844
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003845#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003846
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003847/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003848
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003849#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003850#define NEED_RETRY
3851#endif
3852
3853/* XXX This code is limited to "true" double-byte encodings, as
3854 a) it assumes an incomplete character consists of a single byte, and
3855 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003856 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003857
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   The byte must satisfy IsDBCSLeadByte().  It is then accepted when
   CharPrev() reports no earlier position (it returns the byte itself),
   when the byte at the previous position is not a lead byte, or when
   the previous position is exactly two bytes back. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
3868
3869/*
3870 * Decode MBCS string into unicode object. If 'final' is set, converts
3871 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3872 */
3873static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003874 const char *s, /* MBCS string */
3875 int size, /* sizeof MBCS string */
3876 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003877{
3878 Py_UNICODE *p;
3879 Py_ssize_t n = 0;
3880 int usize = 0;
3881
3882 assert(size >= 0);
3883
3884 /* Skip trailing lead-byte unless 'final' is set */
3885 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003886 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003887
3888 /* First get the size of the result */
3889 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003890 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3891 if (usize == 0) {
3892 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3893 return -1;
3894 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003895 }
3896
3897 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003898 /* Create unicode object */
3899 *v = _PyUnicode_New(usize);
3900 if (*v == NULL)
3901 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003902 }
3903 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003904 /* Extend unicode object */
3905 n = PyUnicode_GET_SIZE(*v);
3906 if (_PyUnicode_Resize(v, n + usize) < 0)
3907 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003908 }
3909
3910 /* Do the conversion */
3911 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003912 p = PyUnicode_AS_UNICODE(*v) + n;
3913 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3914 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3915 return -1;
3916 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003917 }
3918
3919 return size;
3920}
3921
3922PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003923 Py_ssize_t size,
3924 const char *errors,
3925 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003926{
3927 PyUnicodeObject *v = NULL;
3928 int done;
3929
3930 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003931 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003932
3933#ifdef NEED_RETRY
3934 retry:
3935 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003936 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003937 else
3938#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003939 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003940
3941 if (done < 0) {
3942 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003943 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003944 }
3945
3946 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003947 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003948
3949#ifdef NEED_RETRY
3950 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003951 s += done;
3952 size -= done;
3953 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003954 }
3955#endif
3956
3957 return (PyObject *)v;
3958}
3959
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003960PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003961 Py_ssize_t size,
3962 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003963{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003964 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3965}
3966
3967/*
3968 * Convert unicode into string object (MBCS).
3969 * Returns 0 if succeed, -1 otherwise.
3970 */
3971static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003972 const Py_UNICODE *p, /* unicode */
3973 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003974{
3975 int mbcssize = 0;
3976 Py_ssize_t n = 0;
3977
3978 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003979
3980 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003981 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003982 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3983 if (mbcssize == 0) {
3984 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3985 return -1;
3986 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003987 }
3988
Martin v. Löwisd8251432006-06-14 05:21:04 +00003989 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003990 /* Create string object */
3991 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3992 if (*repr == NULL)
3993 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003994 }
3995 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003996 /* Extend string object */
3997 n = PyString_Size(*repr);
3998 if (_PyString_Resize(repr, n + mbcssize) < 0)
3999 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004000 }
4001
4002 /* Do the conversion */
4003 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004004 char *s = PyString_AS_STRING(*repr) + n;
4005 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4006 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4007 return -1;
4008 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004009 }
4010
4011 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004012}
4013
4014PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004015 Py_ssize_t size,
4016 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004017{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004018 PyObject *repr = NULL;
4019 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004020
Martin v. Löwisd8251432006-06-14 05:21:04 +00004021#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004022 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004023 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004024 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004025 else
4026#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004027 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004028
Martin v. Löwisd8251432006-06-14 05:21:04 +00004029 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004030 Py_XDECREF(repr);
4031 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004032 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004033
4034#ifdef NEED_RETRY
4035 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004036 p += INT_MAX;
4037 size -= INT_MAX;
4038 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004039 }
4040#endif
4041
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004042 return repr;
4043}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004044
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004045PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4046{
4047 if (!PyUnicode_Check(unicode)) {
4048 PyErr_BadArgument();
4049 return NULL;
4050 }
4051 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004052 PyUnicode_GET_SIZE(unicode),
4053 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004054}
4055
Martin v. Löwisd8251432006-06-14 05:21:04 +00004056#undef NEED_RETRY
4057
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004058#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004059
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060/* --- Character Mapping Codec -------------------------------------------- */
4061
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004063 Py_ssize_t size,
4064 PyObject *mapping,
4065 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004068 Py_ssize_t startinpos;
4069 Py_ssize_t endinpos;
4070 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004071 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 PyUnicodeObject *v;
4073 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004074 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075 PyObject *errorHandler = NULL;
4076 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004077 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004078 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004079
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 /* Default to Latin-1 */
4081 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004082 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083
4084 v = _PyUnicode_New(size);
4085 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004086 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004088 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004090 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004091 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004092 mapstring = PyUnicode_AS_UNICODE(mapping);
4093 maplen = PyUnicode_GET_SIZE(mapping);
4094 while (s < e) {
4095 unsigned char ch = *s;
4096 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004098 if (ch < maplen)
4099 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004101 if (x == 0xfffe) {
4102 /* undefined mapping */
4103 outpos = p-PyUnicode_AS_UNICODE(v);
4104 startinpos = s-starts;
4105 endinpos = startinpos+1;
4106 if (unicode_decode_call_errorhandler(
4107 errors, &errorHandler,
4108 "charmap", "character maps to <undefined>",
4109 starts, size, &startinpos, &endinpos, &exc, &s,
4110 &v, &outpos, &p)) {
4111 goto onError;
4112 }
4113 continue;
4114 }
4115 *p++ = x;
4116 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004117 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004118 }
4119 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004120 while (s < e) {
4121 unsigned char ch = *s;
4122 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004123
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004124 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4125 w = PyInt_FromLong((long)ch);
4126 if (w == NULL)
4127 goto onError;
4128 x = PyObject_GetItem(mapping, w);
4129 Py_DECREF(w);
4130 if (x == NULL) {
4131 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4132 /* No mapping found means: mapping is undefined. */
4133 PyErr_Clear();
4134 x = Py_None;
4135 Py_INCREF(x);
4136 } else
4137 goto onError;
4138 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004139
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004140 /* Apply mapping */
4141 if (PyInt_Check(x)) {
4142 long value = PyInt_AS_LONG(x);
4143 if (value < 0 || value > 65535) {
4144 PyErr_SetString(PyExc_TypeError,
4145 "character mapping must be in range(65536)");
4146 Py_DECREF(x);
4147 goto onError;
4148 }
4149 *p++ = (Py_UNICODE)value;
4150 }
4151 else if (x == Py_None) {
4152 /* undefined mapping */
4153 outpos = p-PyUnicode_AS_UNICODE(v);
4154 startinpos = s-starts;
4155 endinpos = startinpos+1;
4156 if (unicode_decode_call_errorhandler(
4157 errors, &errorHandler,
4158 "charmap", "character maps to <undefined>",
4159 starts, size, &startinpos, &endinpos, &exc, &s,
4160 &v, &outpos, &p)) {
4161 Py_DECREF(x);
4162 goto onError;
4163 }
4164 Py_DECREF(x);
4165 continue;
4166 }
4167 else if (PyUnicode_Check(x)) {
4168 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004169
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004170 if (targetsize == 1)
4171 /* 1-1 mapping */
4172 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004173
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004174 else if (targetsize > 1) {
4175 /* 1-n mapping */
4176 if (targetsize > extrachars) {
4177 /* resize first */
4178 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4179 Py_ssize_t needed = (targetsize - extrachars) + \
4180 (targetsize << 2);
4181 extrachars += needed;
4182 /* XXX overflow detection missing */
4183 if (_PyUnicode_Resize(&v,
4184 PyUnicode_GET_SIZE(v) + needed) < 0) {
4185 Py_DECREF(x);
4186 goto onError;
4187 }
4188 p = PyUnicode_AS_UNICODE(v) + oldpos;
4189 }
4190 Py_UNICODE_COPY(p,
4191 PyUnicode_AS_UNICODE(x),
4192 targetsize);
4193 p += targetsize;
4194 extrachars -= targetsize;
4195 }
4196 /* 1-0 mapping: skip the character */
4197 }
4198 else {
4199 /* wrong return value */
4200 PyErr_SetString(PyExc_TypeError,
4201 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004202 Py_DECREF(x);
4203 goto onError;
4204 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004205 Py_DECREF(x);
4206 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 }
4209 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004210 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4211 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212 Py_XDECREF(errorHandler);
4213 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004215
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004216 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004217 Py_XDECREF(errorHandler);
4218 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219 Py_XDECREF(v);
4220 return NULL;
4221}
4222
Martin v. Löwis3f767792006-06-04 19:36:28 +00004223/* Charmap encoding: the lookup table */
4224
4225struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004226 PyObject_HEAD
4227 unsigned char level1[32];
4228 int count2, count3;
4229 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004230};
4231
4232static PyObject*
4233encoding_map_size(PyObject *obj, PyObject* args)
4234{
4235 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004236 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004237 128*map->count3);
4238}
4239
4240static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004241 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004242 PyDoc_STR("Return the size (in bytes) of this object") },
4243 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004244};
4245
4246static void
4247encoding_map_dealloc(PyObject* o)
4248{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004249 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004250}
4251
4252static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004253 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004254 "EncodingMap", /*tp_name*/
4255 sizeof(struct encoding_map), /*tp_basicsize*/
4256 0, /*tp_itemsize*/
4257 /* methods */
4258 encoding_map_dealloc, /*tp_dealloc*/
4259 0, /*tp_print*/
4260 0, /*tp_getattr*/
4261 0, /*tp_setattr*/
4262 0, /*tp_compare*/
4263 0, /*tp_repr*/
4264 0, /*tp_as_number*/
4265 0, /*tp_as_sequence*/
4266 0, /*tp_as_mapping*/
4267 0, /*tp_hash*/
4268 0, /*tp_call*/
4269 0, /*tp_str*/
4270 0, /*tp_getattro*/
4271 0, /*tp_setattro*/
4272 0, /*tp_as_buffer*/
4273 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4274 0, /*tp_doc*/
4275 0, /*tp_traverse*/
4276 0, /*tp_clear*/
4277 0, /*tp_richcompare*/
4278 0, /*tp_weaklistoffset*/
4279 0, /*tp_iter*/
4280 0, /*tp_iternext*/
4281 encoding_map_methods, /*tp_methods*/
4282 0, /*tp_members*/
4283 0, /*tp_getset*/
4284 0, /*tp_base*/
4285 0, /*tp_dict*/
4286 0, /*tp_descr_get*/
4287 0, /*tp_descr_set*/
4288 0, /*tp_dictoffset*/
4289 0, /*tp_init*/
4290 0, /*tp_alloc*/
4291 0, /*tp_new*/
4292 0, /*tp_free*/
4293 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004294};
4295
4296PyObject*
4297PyUnicode_BuildEncodingMap(PyObject* string)
4298{
4299 Py_UNICODE *decode;
4300 PyObject *result;
4301 struct encoding_map *mresult;
4302 int i;
4303 int need_dict = 0;
4304 unsigned char level1[32];
4305 unsigned char level2[512];
4306 unsigned char *mlevel1, *mlevel2, *mlevel3;
4307 int count2 = 0, count3 = 0;
4308
4309 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4310 PyErr_BadArgument();
4311 return NULL;
4312 }
4313 decode = PyUnicode_AS_UNICODE(string);
4314 memset(level1, 0xFF, sizeof level1);
4315 memset(level2, 0xFF, sizeof level2);
4316
4317 /* If there isn't a one-to-one mapping of NULL to \0,
4318 or if there are non-BMP characters, we need to use
4319 a mapping dictionary. */
4320 if (decode[0] != 0)
4321 need_dict = 1;
4322 for (i = 1; i < 256; i++) {
4323 int l1, l2;
4324 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004325#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004326 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004327#endif
4328 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004329 need_dict = 1;
4330 break;
4331 }
4332 if (decode[i] == 0xFFFE)
4333 /* unmapped character */
4334 continue;
4335 l1 = decode[i] >> 11;
4336 l2 = decode[i] >> 7;
4337 if (level1[l1] == 0xFF)
4338 level1[l1] = count2++;
4339 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004340 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004341 }
4342
4343 if (count2 >= 0xFF || count3 >= 0xFF)
4344 need_dict = 1;
4345
4346 if (need_dict) {
4347 PyObject *result = PyDict_New();
4348 PyObject *key, *value;
4349 if (!result)
4350 return NULL;
4351 for (i = 0; i < 256; i++) {
4352 key = value = NULL;
4353 key = PyInt_FromLong(decode[i]);
4354 value = PyInt_FromLong(i);
4355 if (!key || !value)
4356 goto failed1;
4357 if (PyDict_SetItem(result, key, value) == -1)
4358 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004359 Py_DECREF(key);
4360 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004361 }
4362 return result;
4363 failed1:
4364 Py_XDECREF(key);
4365 Py_XDECREF(value);
4366 Py_DECREF(result);
4367 return NULL;
4368 }
4369
4370 /* Create a three-level trie */
4371 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4372 16*count2 + 128*count3 - 1);
4373 if (!result)
4374 return PyErr_NoMemory();
4375 PyObject_Init(result, &EncodingMapType);
4376 mresult = (struct encoding_map*)result;
4377 mresult->count2 = count2;
4378 mresult->count3 = count3;
4379 mlevel1 = mresult->level1;
4380 mlevel2 = mresult->level23;
4381 mlevel3 = mresult->level23 + 16*count2;
4382 memcpy(mlevel1, level1, 32);
4383 memset(mlevel2, 0xFF, 16*count2);
4384 memset(mlevel3, 0, 128*count3);
4385 count3 = 0;
4386 for (i = 1; i < 256; i++) {
4387 int o1, o2, o3, i2, i3;
4388 if (decode[i] == 0xFFFE)
4389 /* unmapped character */
4390 continue;
4391 o1 = decode[i]>>11;
4392 o2 = (decode[i]>>7) & 0xF;
4393 i2 = 16*mlevel1[o1] + o2;
4394 if (mlevel2[i2] == 0xFF)
4395 mlevel2[i2] = count3++;
4396 o3 = decode[i] & 0x7F;
4397 i3 = 128*mlevel2[i2] + o3;
4398 mlevel3[i3] = i;
4399 }
4400 return result;
4401}
4402
4403static int
4404encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4405{
4406 struct encoding_map *map = (struct encoding_map*)mapping;
4407 int l1 = c>>11;
4408 int l2 = (c>>7) & 0xF;
4409 int l3 = c & 0x7F;
4410 int i;
4411
4412#ifdef Py_UNICODE_WIDE
4413 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004414 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004415 }
4416#endif
4417 if (c == 0)
4418 return 0;
4419 /* level 1*/
4420 i = map->level1[l1];
4421 if (i == 0xFF) {
4422 return -1;
4423 }
4424 /* level 2*/
4425 i = map->level23[16*i+l2];
4426 if (i == 0xFF) {
4427 return -1;
4428 }
4429 /* level 3 */
4430 i = map->level23[16*map->count2 + 128*i + l3];
4431 if (i == 0) {
4432 return -1;
4433 }
4434 return i;
4435}
4436
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004437/* Lookup the character ch in the mapping. If the character
4438 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004439 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 PyObject *w = PyInt_FromLong((long)c);
4443 PyObject *x;
4444
4445 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004446 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 x = PyObject_GetItem(mapping, w);
4448 Py_DECREF(w);
4449 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004450 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4451 /* No mapping found means: mapping is undefined. */
4452 PyErr_Clear();
4453 x = Py_None;
4454 Py_INCREF(x);
4455 return x;
4456 } else
4457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004459 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004460 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004462 long value = PyInt_AS_LONG(x);
4463 if (value < 0 || value > 255) {
4464 PyErr_SetString(PyExc_TypeError,
4465 "character mapping must be in range(256)");
4466 Py_DECREF(x);
4467 return NULL;
4468 }
4469 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004471 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004472 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004474 /* wrong return value */
4475 PyErr_SetString(PyExc_TypeError,
4476 "character mapping must return integer, None or str");
4477 Py_DECREF(x);
4478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479 }
4480}
4481
Martin v. Löwis3f767792006-06-04 19:36:28 +00004482static int
4483charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4484{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004485 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4486 /* exponentially overallocate to minimize reallocations */
4487 if (requiredsize < 2*outsize)
4488 requiredsize = 2*outsize;
4489 if (_PyString_Resize(outobj, requiredsize)) {
4490 return 0;
4491 }
4492 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004493}
4494
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the mapping is undefined for the character */
    enc_EXCEPTION       /* a lookup or a resize of the output failed */
} charmapencode_result;
/* lookup the character, put the result in the output string and adjust
   various state variables. Reallocate the output string if not enough
   space is available. Return enc_SUCCESS if the character was written
   to the output buffer, enc_FAILED if the mapping was undefined
   (in which case no character was written) or enc_EXCEPTION if an
   error (e.g. a failed lookup or reallocation) occurred. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004505charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004506 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004507{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004508 PyObject *rep;
4509 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004510 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511
Christian Heimese93237d2007-12-19 02:37:44 +00004512 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004513 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004514 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004515 if (res == -1)
4516 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004517 if (outsize<requiredsize)
4518 if (!charmapencode_resize(outobj, outpos, requiredsize))
4519 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004520 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004521 outstart[(*outpos)++] = (char)res;
4522 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004523 }
4524
4525 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004527 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004528 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004529 Py_DECREF(rep);
4530 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004531 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004532 if (PyInt_Check(rep)) {
4533 Py_ssize_t requiredsize = *outpos+1;
4534 if (outsize<requiredsize)
4535 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4536 Py_DECREF(rep);
4537 return enc_EXCEPTION;
4538 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004539 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004540 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004541 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004542 else {
4543 const char *repchars = PyString_AS_STRING(rep);
4544 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4545 Py_ssize_t requiredsize = *outpos+repsize;
4546 if (outsize<requiredsize)
4547 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4548 Py_DECREF(rep);
4549 return enc_EXCEPTION;
4550 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004551 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004552 memcpy(outstart + *outpos, repchars, repsize);
4553 *outpos += repsize;
4554 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 }
Georg Brandl9f167602006-06-04 21:46:16 +00004556 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004557 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558}
4559
4560/* handle an error in PyUnicode_EncodeCharmap
4561 Return 0 on success, -1 on error */
4562static
4563int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004564 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004566 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004567 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568{
4569 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004570 Py_ssize_t repsize;
4571 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 Py_UNICODE *uni2;
4573 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004574 Py_ssize_t collstartpos = *inpos;
4575 Py_ssize_t collendpos = *inpos+1;
4576 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 char *encoding = "charmap";
4578 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004579 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004581 /* find all unencodable characters */
4582 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004583 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004584 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004585 int res = encoding_map_lookup(p[collendpos], mapping);
4586 if (res != -1)
4587 break;
4588 ++collendpos;
4589 continue;
4590 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004591
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004592 rep = charmapencode_lookup(p[collendpos], mapping);
4593 if (rep==NULL)
4594 return -1;
4595 else if (rep!=Py_None) {
4596 Py_DECREF(rep);
4597 break;
4598 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004599 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004600 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004601 }
4602 /* cache callback name lookup
4603 * (if not done yet, i.e. it's the first error) */
4604 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004605 if ((errors==NULL) || (!strcmp(errors, "strict")))
4606 *known_errorHandler = 1;
4607 else if (!strcmp(errors, "replace"))
4608 *known_errorHandler = 2;
4609 else if (!strcmp(errors, "ignore"))
4610 *known_errorHandler = 3;
4611 else if (!strcmp(errors, "xmlcharrefreplace"))
4612 *known_errorHandler = 4;
4613 else
4614 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004615 }
4616 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004617 case 1: /* strict */
4618 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4619 return -1;
4620 case 2: /* replace */
4621 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004622 x = charmapencode_output('?', mapping, res, respos);
4623 if (x==enc_EXCEPTION) {
4624 return -1;
4625 }
4626 else if (x==enc_FAILED) {
4627 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4628 return -1;
4629 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004630 }
4631 /* fall through */
4632 case 3: /* ignore */
4633 *inpos = collendpos;
4634 break;
4635 case 4: /* xmlcharrefreplace */
4636 /* generate replacement (temporarily (mis)uses p) */
4637 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004638 char buffer[2+29+1+1];
4639 char *cp;
4640 sprintf(buffer, "&#%d;", (int)p[collpos]);
4641 for (cp = buffer; *cp; ++cp) {
4642 x = charmapencode_output(*cp, mapping, res, respos);
4643 if (x==enc_EXCEPTION)
4644 return -1;
4645 else if (x==enc_FAILED) {
4646 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4647 return -1;
4648 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004649 }
4650 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004651 *inpos = collendpos;
4652 break;
4653 default:
4654 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004655 encoding, reason, p, size, exceptionObject,
4656 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004657 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004658 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004659 /* generate replacement */
4660 repsize = PyUnicode_GET_SIZE(repunicode);
4661 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004662 x = charmapencode_output(*uni2, mapping, res, respos);
4663 if (x==enc_EXCEPTION) {
4664 return -1;
4665 }
4666 else if (x==enc_FAILED) {
4667 Py_DECREF(repunicode);
4668 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4669 return -1;
4670 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004671 }
4672 *inpos = newpos;
4673 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004674 }
4675 return 0;
4676}
4677
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004679 Py_ssize_t size,
4680 PyObject *mapping,
4681 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683 /* output object */
4684 PyObject *res = NULL;
4685 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004686 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004687 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004688 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004689 PyObject *errorHandler = NULL;
4690 PyObject *exc = NULL;
4691 /* the following variable is used for caching string comparisons
4692 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4693 * 3=ignore, 4=xmlcharrefreplace */
4694 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695
4696 /* Default to Latin-1 */
4697 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004698 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 /* allocate enough for a simple encoding without
4701 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004702 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004703 if (res == NULL)
4704 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004705 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004706 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004708 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004709 /* try to encode it */
4710 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4711 if (x==enc_EXCEPTION) /* error */
4712 goto onError;
4713 if (x==enc_FAILED) { /* unencodable character */
4714 if (charmap_encoding_error(p, size, &inpos, mapping,
4715 &exc,
4716 &known_errorHandler, &errorHandler, errors,
4717 &res, &respos)) {
4718 goto onError;
4719 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004720 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004721 else
4722 /* done with this character => adjust input position */
4723 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004727 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004728 if (_PyString_Resize(&res, respos))
4729 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004730 }
4731 Py_XDECREF(exc);
4732 Py_XDECREF(errorHandler);
4733 return res;
4734
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004735 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004736 Py_XDECREF(res);
4737 Py_XDECREF(exc);
4738 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 return NULL;
4740}
4741
4742PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004743 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744{
4745 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004746 PyErr_BadArgument();
4747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 }
4749 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004750 PyUnicode_GET_SIZE(unicode),
4751 mapping,
4752 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753}
4754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755/* create or adjust a UnicodeTranslateError */
4756static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004757 const Py_UNICODE *unicode, Py_ssize_t size,
4758 Py_ssize_t startpos, Py_ssize_t endpos,
4759 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004761 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004762 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004763 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 }
4765 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004766 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4767 goto onError;
4768 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4769 goto onError;
4770 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4771 goto onError;
4772 return;
4773 onError:
4774 Py_DECREF(*exceptionObject);
4775 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 }
4777}
4778
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004779/* raises a UnicodeTranslateError */
4780static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004781 const Py_UNICODE *unicode, Py_ssize_t size,
4782 Py_ssize_t startpos, Py_ssize_t endpos,
4783 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004784{
4785 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004786 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004787 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004788 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004789}
4790
4791/* error handling callback helper:
4792 build arguments, call the callback and check the arguments,
4793 put the result into newpos and return the replacement string, which
4794 has to be freed by the caller */
4795static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004796 PyObject **errorHandler,
4797 const char *reason,
4798 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4799 Py_ssize_t startpos, Py_ssize_t endpos,
4800 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004802 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803
Martin v. Löwis412fb672006-04-13 06:34:32 +00004804 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 PyObject *restuple;
4806 PyObject *resunicode;
4807
4808 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004809 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004810 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004811 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 }
4813
4814 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004815 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004817 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818
4819 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004820 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004822 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004824 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004825 Py_DECREF(restuple);
4826 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004827 }
4828 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004829 &resunicode, &i_newpos)) {
4830 Py_DECREF(restuple);
4831 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004833 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004834 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004835 else
4836 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004837 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004838 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4839 Py_DECREF(restuple);
4840 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004841 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 Py_INCREF(resunicode);
4843 Py_DECREF(restuple);
4844 return resunicode;
4845}
4846
4847/* Lookup the character ch in the mapping and put the result in result,
4848 which must be decrefed by the caller.
4849 Return 0 on success, -1 on error */
4850static
4851int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4852{
4853 PyObject *w = PyInt_FromLong((long)c);
4854 PyObject *x;
4855
4856 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004857 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004858 x = PyObject_GetItem(mapping, w);
4859 Py_DECREF(w);
4860 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004861 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4862 /* No mapping found means: use 1:1 mapping. */
4863 PyErr_Clear();
4864 *result = NULL;
4865 return 0;
4866 } else
4867 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 }
4869 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004870 *result = x;
4871 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 }
4873 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004874 long value = PyInt_AS_LONG(x);
4875 long max = PyUnicode_GetMax();
4876 if (value < 0 || value > max) {
4877 PyErr_Format(PyExc_TypeError,
4878 "character mapping must be in range(0x%lx)", max+1);
4879 Py_DECREF(x);
4880 return -1;
4881 }
4882 *result = x;
4883 return 0;
4884 }
4885 else if (PyUnicode_Check(x)) {
4886 *result = x;
4887 return 0;
4888 }
4889 else {
4890 /* wrong return value */
4891 PyErr_SetString(PyExc_TypeError,
4892 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004893 Py_DECREF(x);
4894 return -1;
4895 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896}
4897/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004898 if not reallocate and adjust various state variables.
4899 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900static
Walter Dörwald4894c302003-10-24 14:25:28 +00004901int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004902 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004903{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004904 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004905 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004906 /* remember old output position */
4907 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4908 /* exponentially overallocate to minimize reallocations */
4909 if (requiredsize < 2 * oldsize)
4910 requiredsize = 2 * oldsize;
4911 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4912 return -1;
4913 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 }
4915 return 0;
4916}
4917/* lookup the character, put the result in the output string and adjust
4918 various state variables. Return a new reference to the object that
4919 was put in the output buffer in *result, or Py_None, if the mapping was
4920 undefined (in which case no character was written).
4921 The called must decref result.
4922 Return 0 on success, -1 on error. */
4923static
Walter Dörwald4894c302003-10-24 14:25:28 +00004924int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004925 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4926 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004927{
Walter Dörwald4894c302003-10-24 14:25:28 +00004928 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004929 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004931 /* not found => default to 1:1 mapping */
4932 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004933 }
4934 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004935 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004936 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004937 /* no overflow check, because we know that the space is enough */
4938 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004939 }
4940 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004941 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4942 if (repsize==1) {
4943 /* no overflow check, because we know that the space is enough */
4944 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4945 }
4946 else if (repsize!=0) {
4947 /* more than one character */
4948 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4949 (insize - (curinp-startinp)) +
4950 repsize - 1;
4951 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4952 return -1;
4953 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4954 *outp += repsize;
4955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004956 }
4957 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004958 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004959 return 0;
4960}
4961
4962PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004963 Py_ssize_t size,
4964 PyObject *mapping,
4965 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 /* output object */
4968 PyObject *res = NULL;
4969 /* pointers to the beginning and end+1 of input */
4970 const Py_UNICODE *startp = p;
4971 const Py_UNICODE *endp = p + size;
4972 /* pointer into the output */
4973 Py_UNICODE *str;
4974 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004975 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004976 char *reason = "character maps to <undefined>";
4977 PyObject *errorHandler = NULL;
4978 PyObject *exc = NULL;
4979 /* the following variable is used for caching string comparisons
4980 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4981 * 3=ignore, 4=xmlcharrefreplace */
4982 int known_errorHandler = -1;
4983
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004985 PyErr_BadArgument();
4986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004988
4989 /* allocate enough for a simple 1:1 translation without
4990 replacements, if we need more, we'll resize */
4991 res = PyUnicode_FromUnicode(NULL, size);
4992 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004993 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004995 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004996 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004997
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004998 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004999 /* try to encode it */
5000 PyObject *x = NULL;
5001 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5002 Py_XDECREF(x);
5003 goto onError;
5004 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005005 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005006 if (x!=Py_None) /* it worked => adjust input pointer */
5007 ++p;
5008 else { /* untranslatable character */
5009 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5010 Py_ssize_t repsize;
5011 Py_ssize_t newpos;
5012 Py_UNICODE *uni2;
5013 /* startpos for collecting untranslatable chars */
5014 const Py_UNICODE *collstart = p;
5015 const Py_UNICODE *collend = p+1;
5016 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005018 /* find all untranslatable characters */
5019 while (collend < endp) {
5020 if (charmaptranslate_lookup(*collend, mapping, &x))
5021 goto onError;
5022 Py_XDECREF(x);
5023 if (x!=Py_None)
5024 break;
5025 ++collend;
5026 }
5027 /* cache callback name lookup
5028 * (if not done yet, i.e. it's the first error) */
5029 if (known_errorHandler==-1) {
5030 if ((errors==NULL) || (!strcmp(errors, "strict")))
5031 known_errorHandler = 1;
5032 else if (!strcmp(errors, "replace"))
5033 known_errorHandler = 2;
5034 else if (!strcmp(errors, "ignore"))
5035 known_errorHandler = 3;
5036 else if (!strcmp(errors, "xmlcharrefreplace"))
5037 known_errorHandler = 4;
5038 else
5039 known_errorHandler = 0;
5040 }
5041 switch (known_errorHandler) {
5042 case 1: /* strict */
5043 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005044 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005045 case 2: /* replace */
5046 /* No need to check for space, this is a 1:1 replacement */
5047 for (coll = collstart; coll<collend; ++coll)
5048 *str++ = '?';
5049 /* fall through */
5050 case 3: /* ignore */
5051 p = collend;
5052 break;
5053 case 4: /* xmlcharrefreplace */
5054 /* generate replacement (temporarily (mis)uses p) */
5055 for (p = collstart; p < collend; ++p) {
5056 char buffer[2+29+1+1];
5057 char *cp;
5058 sprintf(buffer, "&#%d;", (int)*p);
5059 if (charmaptranslate_makespace(&res, &str,
5060 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5061 goto onError;
5062 for (cp = buffer; *cp; ++cp)
5063 *str++ = *cp;
5064 }
5065 p = collend;
5066 break;
5067 default:
5068 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5069 reason, startp, size, &exc,
5070 collstart-startp, collend-startp, &newpos);
5071 if (repunicode == NULL)
5072 goto onError;
5073 /* generate replacement */
5074 repsize = PyUnicode_GET_SIZE(repunicode);
5075 if (charmaptranslate_makespace(&res, &str,
5076 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5077 Py_DECREF(repunicode);
5078 goto onError;
5079 }
5080 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5081 *str++ = *uni2;
5082 p = startp + newpos;
5083 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005084 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005085 }
5086 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005087 /* Resize if we allocated to much */
5088 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005089 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005090 if (PyUnicode_Resize(&res, respos) < 0)
5091 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005092 }
5093 Py_XDECREF(exc);
5094 Py_XDECREF(errorHandler);
5095 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005097 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005098 Py_XDECREF(res);
5099 Py_XDECREF(exc);
5100 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 return NULL;
5102}
5103
5104PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005105 PyObject *mapping,
5106 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107{
5108 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005109
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 str = PyUnicode_FromObject(str);
5111 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005112 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005114 PyUnicode_GET_SIZE(str),
5115 mapping,
5116 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 Py_DECREF(str);
5118 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005119
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005120 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 Py_XDECREF(str);
5122 return NULL;
5123}
Tim Petersced69f82003-09-16 20:30:58 +00005124
Guido van Rossum9e896b32000-04-05 20:11:21 +00005125/* --- Decimal Encoder ---------------------------------------------------- */
5126
5127int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005128 Py_ssize_t length,
5129 char *output,
5130 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005131{
5132 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005133 PyObject *errorHandler = NULL;
5134 PyObject *exc = NULL;
5135 const char *encoding = "decimal";
5136 const char *reason = "invalid decimal Unicode string";
5137 /* the following variable is used for caching string comparisons
5138 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5139 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005140
5141 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005142 PyErr_BadArgument();
5143 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005144 }
5145
5146 p = s;
5147 end = s + length;
5148 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005149 register Py_UNICODE ch = *p;
5150 int decimal;
5151 PyObject *repunicode;
5152 Py_ssize_t repsize;
5153 Py_ssize_t newpos;
5154 Py_UNICODE *uni2;
5155 Py_UNICODE *collstart;
5156 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005157
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005158 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005159 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005160 ++p;
5161 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005162 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005163 decimal = Py_UNICODE_TODECIMAL(ch);
5164 if (decimal >= 0) {
5165 *output++ = '0' + decimal;
5166 ++p;
5167 continue;
5168 }
5169 if (0 < ch && ch < 256) {
5170 *output++ = (char)ch;
5171 ++p;
5172 continue;
5173 }
5174 /* All other characters are considered unencodable */
5175 collstart = p;
5176 collend = p+1;
5177 while (collend < end) {
5178 if ((0 < *collend && *collend < 256) ||
5179 !Py_UNICODE_ISSPACE(*collend) ||
5180 Py_UNICODE_TODECIMAL(*collend))
5181 break;
5182 }
5183 /* cache callback name lookup
5184 * (if not done yet, i.e. it's the first error) */
5185 if (known_errorHandler==-1) {
5186 if ((errors==NULL) || (!strcmp(errors, "strict")))
5187 known_errorHandler = 1;
5188 else if (!strcmp(errors, "replace"))
5189 known_errorHandler = 2;
5190 else if (!strcmp(errors, "ignore"))
5191 known_errorHandler = 3;
5192 else if (!strcmp(errors, "xmlcharrefreplace"))
5193 known_errorHandler = 4;
5194 else
5195 known_errorHandler = 0;
5196 }
5197 switch (known_errorHandler) {
5198 case 1: /* strict */
5199 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5200 goto onError;
5201 case 2: /* replace */
5202 for (p = collstart; p < collend; ++p)
5203 *output++ = '?';
5204 /* fall through */
5205 case 3: /* ignore */
5206 p = collend;
5207 break;
5208 case 4: /* xmlcharrefreplace */
5209 /* generate replacement (temporarily (mis)uses p) */
5210 for (p = collstart; p < collend; ++p)
5211 output += sprintf(output, "&#%d;", (int)*p);
5212 p = collend;
5213 break;
5214 default:
5215 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5216 encoding, reason, s, length, &exc,
5217 collstart-s, collend-s, &newpos);
5218 if (repunicode == NULL)
5219 goto onError;
5220 /* generate replacement */
5221 repsize = PyUnicode_GET_SIZE(repunicode);
5222 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5223 Py_UNICODE ch = *uni2;
5224 if (Py_UNICODE_ISSPACE(ch))
5225 *output++ = ' ';
5226 else {
5227 decimal = Py_UNICODE_TODECIMAL(ch);
5228 if (decimal >= 0)
5229 *output++ = '0' + decimal;
5230 else if (0 < ch && ch < 256)
5231 *output++ = (char)ch;
5232 else {
5233 Py_DECREF(repunicode);
5234 raise_encode_exception(&exc, encoding,
5235 s, length, collstart-s, collend-s, reason);
5236 goto onError;
5237 }
5238 }
5239 }
5240 p = s + newpos;
5241 Py_DECREF(repunicode);
5242 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005243 }
5244 /* 0-terminate the output string */
5245 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 Py_XDECREF(exc);
5247 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005248 return 0;
5249
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005250 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005251 Py_XDECREF(exc);
5252 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005253 return -1;
5254}
5255
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256/* --- Helpers ------------------------------------------------------------ */
5257
Eric Smitha9f7d622008-02-17 19:46:49 +00005258#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005259#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005260
5261#include "stringlib/count.h"
5262#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005263#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005264#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005265
/* Helper macro to fix up start/end slice values against a sequence of
   length len: an end beyond len is clamped to len; negative start/end
   values have len added to them and are clamped to 0 if still negative.
   start is not clamped against len.

   start and end must be modifiable lvalues.  All arguments are evaluated
   more than once, so they must not have side effects.  The expansion is a
   bare sequence of if statements (not a single statement), so only use it
   as a complete statement inside a braced block. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Fredrik Lundhc8162812006-05-26 19:33:03 +00005280
Martin v. Löwis18e16552006-02-15 17:27:45 +00005281Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005282 PyObject *substr,
5283 Py_ssize_t start,
5284 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005286 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005287 PyUnicodeObject* str_obj;
5288 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005289
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005290 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5291 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005292 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005293 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5294 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005295 Py_DECREF(str_obj);
5296 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 }
Tim Petersced69f82003-09-16 20:30:58 +00005298
Antoine Pitrou64672132010-01-13 07:55:48 +00005299 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005300 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005301 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5302 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005303 );
5304
5305 Py_DECREF(sub_obj);
5306 Py_DECREF(str_obj);
5307
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 return result;
5309}
5310
Martin v. Löwis18e16552006-02-15 17:27:45 +00005311Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005312 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005313 Py_ssize_t start,
5314 Py_ssize_t end,
5315 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005317 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005318
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005319 str = PyUnicode_FromObject(str);
5320 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005321 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005322 sub = PyUnicode_FromObject(sub);
5323 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005324 Py_DECREF(str);
5325 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 }
Tim Petersced69f82003-09-16 20:30:58 +00005327
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005328 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005329 result = stringlib_find_slice(
5330 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5331 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5332 start, end
5333 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005334 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005335 result = stringlib_rfind_slice(
5336 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5337 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5338 start, end
5339 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005340
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005341 Py_DECREF(str);
5342 Py_DECREF(sub);
5343
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 return result;
5345}
5346
Tim Petersced69f82003-09-16 20:30:58 +00005347static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005349 PyUnicodeObject *substring,
5350 Py_ssize_t start,
5351 Py_ssize_t end,
5352 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 if (substring->length == 0)
5355 return 1;
5356
Antoine Pitrou64672132010-01-13 07:55:48 +00005357 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 end -= substring->length;
5359 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005360 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361
5362 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005363 if (Py_UNICODE_MATCH(self, end, substring))
5364 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 } else {
5366 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005367 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 }
5369
5370 return 0;
5371}
5372
Martin v. Löwis18e16552006-02-15 17:27:45 +00005373Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005374 PyObject *substr,
5375 Py_ssize_t start,
5376 Py_ssize_t end,
5377 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005379 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005380
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 str = PyUnicode_FromObject(str);
5382 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005383 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 substr = PyUnicode_FromObject(substr);
5385 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005386 Py_DECREF(str);
5387 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 }
Tim Petersced69f82003-09-16 20:30:58 +00005389
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005391 (PyUnicodeObject *)substr,
5392 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 Py_DECREF(str);
5394 Py_DECREF(substr);
5395 return result;
5396}
5397
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398/* Apply fixfct filter to the Unicode object self and return a
5399 reference to the modified object */
5400
Tim Petersced69f82003-09-16 20:30:58 +00005401static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005403 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404{
5405
5406 PyUnicodeObject *u;
5407
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005408 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005410 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005411
5412 Py_UNICODE_COPY(u->str, self->str, self->length);
5413
Tim Peters7a29bd52001-09-12 03:03:31 +00005414 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005415 /* fixfct should return TRUE if it modified the buffer. If
5416 FALSE, return a reference to the original buffer instead
5417 (to save space, not time) */
5418 Py_INCREF(self);
5419 Py_DECREF(u);
5420 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 }
5422 return (PyObject*) u;
5423}
5424
Tim Petersced69f82003-09-16 20:30:58 +00005425static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426int fixupper(PyUnicodeObject *self)
5427{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005428 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 Py_UNICODE *s = self->str;
5430 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005431
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005433 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005434
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005435 ch = Py_UNICODE_TOUPPER(*s);
5436 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005438 *s = ch;
5439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 s++;
5441 }
5442
5443 return status;
5444}
5445
Tim Petersced69f82003-09-16 20:30:58 +00005446static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447int fixlower(PyUnicodeObject *self)
5448{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005449 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 Py_UNICODE *s = self->str;
5451 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005452
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005454 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005455
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005456 ch = Py_UNICODE_TOLOWER(*s);
5457 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005459 *s = ch;
5460 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 s++;
5462 }
5463
5464 return status;
5465}
5466
Tim Petersced69f82003-09-16 20:30:58 +00005467static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468int fixswapcase(PyUnicodeObject *self)
5469{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005470 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 Py_UNICODE *s = self->str;
5472 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005473
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 while (len-- > 0) {
5475 if (Py_UNICODE_ISUPPER(*s)) {
5476 *s = Py_UNICODE_TOLOWER(*s);
5477 status = 1;
5478 } else if (Py_UNICODE_ISLOWER(*s)) {
5479 *s = Py_UNICODE_TOUPPER(*s);
5480 status = 1;
5481 }
5482 s++;
5483 }
5484
5485 return status;
5486}
5487
Tim Petersced69f82003-09-16 20:30:58 +00005488static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489int fixcapitalize(PyUnicodeObject *self)
5490{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005491 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005492 Py_UNICODE *s = self->str;
5493 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005494
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005495 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005496 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005497 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005498 *s = Py_UNICODE_TOUPPER(*s);
5499 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005501 s++;
5502 while (--len > 0) {
5503 if (Py_UNICODE_ISUPPER(*s)) {
5504 *s = Py_UNICODE_TOLOWER(*s);
5505 status = 1;
5506 }
5507 s++;
5508 }
5509 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510}
5511
5512static
5513int fixtitle(PyUnicodeObject *self)
5514{
5515 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5516 register Py_UNICODE *e;
5517 int previous_is_cased;
5518
5519 /* Shortcut for single character strings */
5520 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005521 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5522 if (*p != ch) {
5523 *p = ch;
5524 return 1;
5525 }
5526 else
5527 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 }
Tim Petersced69f82003-09-16 20:30:58 +00005529
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 e = p + PyUnicode_GET_SIZE(self);
5531 previous_is_cased = 0;
5532 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005533 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005534
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005535 if (previous_is_cased)
5536 *p = Py_UNICODE_TOLOWER(ch);
5537 else
5538 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005539
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005540 if (Py_UNICODE_ISLOWER(ch) ||
5541 Py_UNICODE_ISUPPER(ch) ||
5542 Py_UNICODE_ISTITLE(ch))
5543 previous_is_cased = 1;
5544 else
5545 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 }
5547 return 1;
5548}
5549
Tim Peters8ce9f162004-08-27 01:49:32 +00005550PyObject *
5551PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552{
Tim Peters8ce9f162004-08-27 01:49:32 +00005553 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005554 const Py_UNICODE blank = ' ';
5555 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005556 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005557 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005558 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5559 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005560 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5561 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005562 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005563 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005564 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565
Tim Peters05eba1f2004-08-27 21:32:02 +00005566 fseq = PySequence_Fast(seq, "");
5567 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005568 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005569 }
5570
Tim Peters91879ab2004-08-27 22:35:44 +00005571 /* Grrrr. A codec may be invoked to convert str objects to
5572 * Unicode, and so it's possible to call back into Python code
5573 * during PyUnicode_FromObject(), and so it's possible for a sick
5574 * codec to change the size of fseq (if seq is a list). Therefore
5575 * we have to keep refetching the size -- can't assume seqlen
5576 * is invariant.
5577 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005578 seqlen = PySequence_Fast_GET_SIZE(fseq);
5579 /* If empty sequence, return u"". */
5580 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005581 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5582 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005583 }
5584 /* If singleton sequence with an exact Unicode, return that. */
5585 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005586 item = PySequence_Fast_GET_ITEM(fseq, 0);
5587 if (PyUnicode_CheckExact(item)) {
5588 Py_INCREF(item);
5589 res = (PyUnicodeObject *)item;
5590 goto Done;
5591 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005592 }
5593
Tim Peters05eba1f2004-08-27 21:32:02 +00005594 /* At least two items to join, or one that isn't exact Unicode. */
5595 if (seqlen > 1) {
5596 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005597 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005598 sep = &blank;
5599 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005600 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005601 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005602 internal_separator = PyUnicode_FromObject(separator);
5603 if (internal_separator == NULL)
5604 goto onError;
5605 sep = PyUnicode_AS_UNICODE(internal_separator);
5606 seplen = PyUnicode_GET_SIZE(internal_separator);
5607 /* In case PyUnicode_FromObject() mutated seq. */
5608 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005609 }
5610 }
5611
5612 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005613 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005614 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005615 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005616 res_p = PyUnicode_AS_UNICODE(res);
5617 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005618
Tim Peters05eba1f2004-08-27 21:32:02 +00005619 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005620 Py_ssize_t itemlen;
5621 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005622
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005623 item = PySequence_Fast_GET_ITEM(fseq, i);
5624 /* Convert item to Unicode. */
5625 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5626 PyErr_Format(PyExc_TypeError,
5627 "sequence item %zd: expected string or Unicode,"
5628 " %.80s found",
5629 i, Py_TYPE(item)->tp_name);
5630 goto onError;
5631 }
5632 item = PyUnicode_FromObject(item);
5633 if (item == NULL)
5634 goto onError;
5635 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005636
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005637 /* In case PyUnicode_FromObject() mutated seq. */
5638 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005639
Tim Peters8ce9f162004-08-27 01:49:32 +00005640 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005641 itemlen = PyUnicode_GET_SIZE(item);
5642 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005643 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005644 goto Overflow;
5645 if (i < seqlen - 1) {
5646 new_res_used += seplen;
5647 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005648 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005649 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005650 if (new_res_used > res_alloc) {
5651 /* double allocated size until it's big enough */
5652 do {
5653 res_alloc += res_alloc;
5654 if (res_alloc <= 0)
5655 goto Overflow;
5656 } while (new_res_used > res_alloc);
5657 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5658 Py_DECREF(item);
5659 goto onError;
5660 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005661 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005662 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005663
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005664 /* Copy item, and maybe the separator. */
5665 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5666 res_p += itemlen;
5667 if (i < seqlen - 1) {
5668 Py_UNICODE_COPY(res_p, sep, seplen);
5669 res_p += seplen;
5670 }
5671 Py_DECREF(item);
5672 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005673 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005674
Tim Peters05eba1f2004-08-27 21:32:02 +00005675 /* Shrink res to match the used area; this probably can't fail,
5676 * but it's cheap to check.
5677 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005678 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005679 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005680
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005681 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005682 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005683 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 return (PyObject *)res;
5685
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005686 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005687 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005688 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005689 Py_DECREF(item);
5690 /* fall through */
5691
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005692 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005693 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005694 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005695 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 return NULL;
5697}
5698
Tim Petersced69f82003-09-16 20:30:58 +00005699static
5700PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005701 Py_ssize_t left,
5702 Py_ssize_t right,
5703 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704{
5705 PyUnicodeObject *u;
5706
5707 if (left < 0)
5708 left = 0;
5709 if (right < 0)
5710 right = 0;
5711
Tim Peters7a29bd52001-09-12 03:03:31 +00005712 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 Py_INCREF(self);
5714 return self;
5715 }
5716
Neal Norwitze7d8be82008-07-31 17:17:14 +00005717 if (left > PY_SSIZE_T_MAX - self->length ||
5718 right > PY_SSIZE_T_MAX - (left + self->length)) {
5719 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5720 return NULL;
5721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 u = _PyUnicode_New(left + self->length + right);
5723 if (u) {
5724 if (left)
5725 Py_UNICODE_FILL(u->str, fill, left);
5726 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5727 if (right)
5728 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5729 }
5730
5731 return u;
5732}
5733
Antoine Pitrou64672132010-01-13 07:55:48 +00005734PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737
5738 string = PyUnicode_FromObject(string);
5739 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
Antoine Pitrou64672132010-01-13 07:55:48 +00005742 list = stringlib_splitlines(
5743 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5744 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
5746 Py_DECREF(string);
5747 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748}
5749
Tim Petersced69f82003-09-16 20:30:58 +00005750static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005752 PyUnicodeObject *substring,
5753 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005756 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005759 return stringlib_split_whitespace(
5760 (PyObject*) self, self->str, self->length, maxcount
5761 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
Antoine Pitrou64672132010-01-13 07:55:48 +00005763 return stringlib_split(
5764 (PyObject*) self, self->str, self->length,
5765 substring->str, substring->length,
5766 maxcount
5767 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768}
5769
Tim Petersced69f82003-09-16 20:30:58 +00005770static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005771PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005772 PyUnicodeObject *substring,
5773 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005774{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005775 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005776 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005777
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005778 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005779 return stringlib_rsplit_whitespace(
5780 (PyObject*) self, self->str, self->length, maxcount
5781 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005782
Antoine Pitrou64672132010-01-13 07:55:48 +00005783 return stringlib_rsplit(
5784 (PyObject*) self, self->str, self->length,
5785 substring->str, substring->length,
5786 maxcount
5787 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005788}
5789
5790static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005792 PyUnicodeObject *str1,
5793 PyUnicodeObject *str2,
5794 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795{
5796 PyUnicodeObject *u;
5797
5798 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005799 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005800 else if (maxcount == 0 || self->length == 0)
5801 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802
Fredrik Lundh347ee272006-05-24 16:35:18 +00005803 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005804 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005805 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005806 if (str1->length == 0)
5807 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005808 if (str1->length == 1) {
5809 /* replace characters */
5810 Py_UNICODE u1, u2;
5811 if (!findchar(self->str, self->length, str1->str[0]))
5812 goto nothing;
5813 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5814 if (!u)
5815 return NULL;
5816 Py_UNICODE_COPY(u->str, self->str, self->length);
5817 u1 = str1->str[0];
5818 u2 = str2->str[0];
5819 for (i = 0; i < u->length; i++)
5820 if (u->str[i] == u1) {
5821 if (--maxcount < 0)
5822 break;
5823 u->str[i] = u2;
5824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005826 i = stringlib_find(
5827 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005829 if (i < 0)
5830 goto nothing;
5831 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5832 if (!u)
5833 return NULL;
5834 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005835
5836 /* change everything in-place, starting with this one */
5837 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5838 i += str1->length;
5839
5840 while ( --maxcount > 0) {
5841 i = stringlib_find(self->str+i, self->length-i,
5842 str1->str, str1->length,
5843 i);
5844 if (i == -1)
5845 break;
5846 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5847 i += str1->length;
5848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005851
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005852 Py_ssize_t n, i, j, e;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005853 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 Py_UNICODE *p;
5855
5856 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005857 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5858 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005859 if (n == 0)
5860 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005861 /* new_size = self->length + n * (str2->length - str1->length)); */
5862 delta = (str2->length - str1->length);
5863 if (delta == 0) {
5864 new_size = self->length;
5865 } else {
5866 product = n * (str2->length - str1->length);
5867 if ((product / (str2->length - str1->length)) != n) {
5868 PyErr_SetString(PyExc_OverflowError,
5869 "replace string is too long");
5870 return NULL;
5871 }
5872 new_size = self->length + product;
5873 if (new_size < 0) {
5874 PyErr_SetString(PyExc_OverflowError,
5875 "replace string is too long");
5876 return NULL;
5877 }
5878 }
5879 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005880 if (!u)
5881 return NULL;
5882 i = 0;
5883 p = u->str;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005884 e = self->length - str1->length;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005885 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005886 while (n-- > 0) {
5887 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005888 j = stringlib_find(self->str+i, self->length-i,
5889 str1->str, str1->length,
5890 i);
5891 if (j == -1)
5892 break;
5893 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005894 /* copy unchanged part [i:j] */
5895 Py_UNICODE_COPY(p, self->str+i, j-i);
5896 p += j - i;
5897 }
5898 /* copy substitution string */
5899 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005900 Py_UNICODE_COPY(p, str2->str, str2->length);
5901 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005902 }
5903 i = j + str1->length;
5904 }
5905 if (i < self->length)
5906 /* copy tail [i:] */
5907 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005908 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005909 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005910 while (n > 0) {
5911 Py_UNICODE_COPY(p, str2->str, str2->length);
5912 p += str2->length;
5913 if (--n <= 0)
5914 break;
5915 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005917 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 }
5919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005921
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005922 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005923 /* nothing to replace; return original string (when possible) */
5924 if (PyUnicode_CheckExact(self)) {
5925 Py_INCREF(self);
5926 return (PyObject *) self;
5927 }
5928 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929}
5930
5931/* --- Unicode Object Methods --------------------------------------------- */
5932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005933PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005934 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935\n\
5936Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005937characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938
5939static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005940unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 return fixup(self, fixtitle);
5943}
5944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005945PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005946 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947\n\
5948Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005949have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
5951static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005952unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 return fixup(self, fixcapitalize);
5955}
5956
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self into a list of
   words, replace each list entry by its fixcapitalize'd form, then join
   the list back into one string.  Returns NULL if any step fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *previous = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)previous, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(previous);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5994
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005995/* Argument converter. Coerces to a single unicode character */
5996
5997static int
5998convert_uc(PyObject *obj, void *addr)
5999{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006000 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6001 PyObject *uniobj;
6002 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006003
Benjamin Peterson857ce152009-01-31 16:29:18 +00006004 uniobj = PyUnicode_FromObject(obj);
6005 if (uniobj == NULL) {
6006 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006007 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006008 return 0;
6009 }
6010 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6011 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006012 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006013 Py_DECREF(uniobj);
6014 return 0;
6015 }
6016 unistr = PyUnicode_AS_UNICODE(uniobj);
6017 *fillcharloc = unistr[0];
6018 Py_DECREF(uniobj);
6019 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006020}
6021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006022PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006023 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006025Return S centered in a Unicode string of length width. Padding is\n\
6026done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027
6028static PyObject *
6029unicode_center(PyUnicodeObject *self, PyObject *args)
6030{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006031 Py_ssize_t marg, left;
6032 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006033 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034
Thomas Woutersde017742006-02-16 19:34:37 +00006035 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 return NULL;
6037
Tim Peters7a29bd52001-09-12 03:03:31 +00006038 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 Py_INCREF(self);
6040 return (PyObject*) self;
6041 }
6042
6043 marg = width - self->length;
6044 left = marg / 2 + (marg & width & 1);
6045
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006046 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047}
6048
Marc-André Lemburge5034372000-08-08 08:04:29 +00006049#if 0
6050
6051/* This code should go into some future Unicode collation support
6052 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006053 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006054
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006055/* speedy UTF-16 code point order comparison */
6056/* gleaned from: */
6057/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6058
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006059static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006060{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006061 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006062 0, 0, 0, 0, 0, 0, 0, 0,
6063 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006064 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006065};
6066
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067static int
6068unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6069{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006070 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006071
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 Py_UNICODE *s1 = str1->str;
6073 Py_UNICODE *s2 = str2->str;
6074
6075 len1 = str1->length;
6076 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006077
Guido van Rossumd57fd912000-03-10 22:53:23 +00006078 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006079 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006080
6081 c1 = *s1++;
6082 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006083
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006084 if (c1 > (1<<11) * 26)
6085 c1 += utf16Fixup[c1>>11];
6086 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006087 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006088 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006089
6090 if (c1 != c2)
6091 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006092
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006093 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
6095
6096 return (len1 < len2) ? -1 : (len1 != len2);
6097}
6098
Marc-André Lemburge5034372000-08-08 08:04:29 +00006099#else
6100
6101static int
6102unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6103{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006104 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006105
6106 Py_UNICODE *s1 = str1->str;
6107 Py_UNICODE *s2 = str2->str;
6108
6109 len1 = str1->length;
6110 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006111
Marc-André Lemburge5034372000-08-08 08:04:29 +00006112 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006113 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006114
Fredrik Lundh45714e92001-06-26 16:39:36 +00006115 c1 = *s1++;
6116 c2 = *s2++;
6117
6118 if (c1 != c2)
6119 return (c1 < c2) ? -1 : 1;
6120
Marc-André Lemburge5034372000-08-08 08:04:29 +00006121 len1--; len2--;
6122 }
6123
6124 return (len1 < len2) ? -1 : (len1 != len2);
6125}
6126
6127#endif
6128
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006130 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131{
6132 PyUnicodeObject *u = NULL, *v = NULL;
6133 int result;
6134
6135 /* Coerce the two arguments */
6136 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6137 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006138 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6140 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006141 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142
Thomas Wouters7e474022000-07-16 12:04:32 +00006143 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006145 Py_DECREF(u);
6146 Py_DECREF(v);
6147 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 }
6149
6150 result = unicode_compare(u, v);
6151
6152 Py_DECREF(u);
6153 Py_DECREF(v);
6154 return result;
6155
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006156 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 Py_XDECREF(u);
6158 Py_XDECREF(v);
6159 return -1;
6160}
6161
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006162PyObject *PyUnicode_RichCompare(PyObject *left,
6163 PyObject *right,
6164 int op)
6165{
6166 int result;
6167
6168 result = PyUnicode_Compare(left, right);
6169 if (result == -1 && PyErr_Occurred())
6170 goto onError;
6171
6172 /* Convert the return value to a Boolean */
6173 switch (op) {
6174 case Py_EQ:
6175 result = (result == 0);
6176 break;
6177 case Py_NE:
6178 result = (result != 0);
6179 break;
6180 case Py_LE:
6181 result = (result <= 0);
6182 break;
6183 case Py_GE:
6184 result = (result >= 0);
6185 break;
6186 case Py_LT:
6187 result = (result == -1);
6188 break;
6189 case Py_GT:
6190 result = (result == 1);
6191 break;
6192 }
6193 return PyBool_FromLong(result);
6194
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006195 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006196
6197 /* Standard case
6198
6199 Type errors mean that PyUnicode_FromObject() could not convert
6200 one of the arguments (usually the right hand side) to Unicode,
6201 ie. we can't handle the comparison request. However, it is
6202 possible that the other object knows a comparison method, which
6203 is why we return Py_NotImplemented to give the other object a
6204 chance.
6205
6206 */
6207 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6208 PyErr_Clear();
6209 Py_INCREF(Py_NotImplemented);
6210 return Py_NotImplemented;
6211 }
6212 if (op != Py_EQ && op != Py_NE)
6213 return NULL;
6214
6215 /* Equality comparison.
6216
6217 This is a special case: we silence any PyExc_UnicodeDecodeError
6218 and instead turn it into a PyErr_UnicodeWarning.
6219
6220 */
6221 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6222 return NULL;
6223 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006224 if (PyErr_Warn(PyExc_UnicodeWarning,
6225 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006226 "Unicode equal comparison "
6227 "failed to convert both arguments to Unicode - "
6228 "interpreting them as being unequal" :
6229 "Unicode unequal comparison "
6230 "failed to convert both arguments to Unicode - "
6231 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006232 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006233 return NULL;
6234 result = (op == Py_NE);
6235 return PyBool_FromLong(result);
6236}
6237
Guido van Rossum403d68b2000-03-13 15:55:09 +00006238int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006239 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006240{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006241 PyObject *str, *sub;
6242 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006243
6244 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006245 sub = PyUnicode_FromObject(element);
6246 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006247 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006248 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006249
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006250 str = PyUnicode_FromObject(container);
6251 if (!str) {
6252 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006253 return -1;
6254 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006255
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006256 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006257
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006258 Py_DECREF(str);
6259 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006260
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006261 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006262}
6263
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264/* Concat to string or Unicode object giving a new Unicode object. */
6265
6266PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006267 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268{
6269 PyUnicodeObject *u = NULL, *v = NULL, *w;
6270
6271 /* Coerce the two arguments */
6272 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6273 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006274 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6276 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006277 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278
6279 /* Shortcuts */
6280 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006281 Py_DECREF(v);
6282 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 }
6284 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006285 Py_DECREF(u);
6286 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 }
6288
6289 /* Concat the two Unicode strings */
6290 w = _PyUnicode_New(u->length + v->length);
6291 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006292 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293 Py_UNICODE_COPY(w->str, u->str, u->length);
6294 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6295
6296 Py_DECREF(u);
6297 Py_DECREF(v);
6298 return (PyObject *)w;
6299
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006300 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 Py_XDECREF(u);
6302 Py_XDECREF(v);
6303 return NULL;
6304}
6305
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006306PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006307 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006309Return the number of non-overlapping occurrences of substring sub in\n\
6310Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006311interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312
6313static PyObject *
6314unicode_count(PyUnicodeObject *self, PyObject *args)
6315{
6316 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006317 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006318 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 PyObject *result;
6320
Guido van Rossumb8872e62000-05-09 14:14:27 +00006321 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006322 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 return NULL;
6324
6325 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006326 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006328 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006329
Antoine Pitrou64672132010-01-13 07:55:48 +00006330 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006331 result = PyInt_FromSsize_t(
6332 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006333 substring->str, substring->length,
6334 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006335 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336
6337 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006338
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 return result;
6340}
6341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006342PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006343 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006345Encodes S using the codec registered for encoding. encoding defaults\n\
6346to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006347handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6349'xmlcharrefreplace' as well as any other name registered with\n\
6350codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351
6352static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006353unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006355 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356 char *encoding = NULL;
6357 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006358 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006359
Benjamin Peterson332d7212009-09-18 21:14:55 +00006360 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6361 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006363 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006364 if (v == NULL)
6365 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006366 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006367 PyErr_Format(PyExc_TypeError,
6368 "encoder did not return a string/unicode object "
6369 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006370 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006371 Py_DECREF(v);
6372 return NULL;
6373 }
6374 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006375
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006376 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006377 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006378}
6379
6380PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006381 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006382\n\
6383Decodes S using the codec registered for encoding. encoding defaults\n\
6384to the default encoding. errors may be given to set a different error\n\
6385handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6386a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6387as well as any other name registerd with codecs.register_error that is\n\
6388able to handle UnicodeDecodeErrors.");
6389
6390static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006391unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006392{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006393 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006394 char *encoding = NULL;
6395 char *errors = NULL;
6396 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006397
Benjamin Peterson332d7212009-09-18 21:14:55 +00006398 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6399 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006400 return NULL;
6401 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006402 if (v == NULL)
6403 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006404 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006405 PyErr_Format(PyExc_TypeError,
6406 "decoder did not return a string/unicode object "
6407 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006408 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006409 Py_DECREF(v);
6410 return NULL;
6411 }
6412 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006413
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006414 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006415 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416}
6417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006418PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006419 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420\n\
6421Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006422If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423
6424static PyObject*
6425unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6426{
6427 Py_UNICODE *e;
6428 Py_UNICODE *p;
6429 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006430 Py_UNICODE *qe;
6431 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 PyUnicodeObject *u;
6433 int tabsize = 8;
6434
6435 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006436 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006437
Thomas Wouters7e474022000-07-16 12:04:32 +00006438 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006439 i = 0; /* chars up to and including most recent \n or \r */
6440 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6441 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 for (p = self->str; p < e; p++)
6443 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006444 if (tabsize > 0) {
6445 incr = tabsize - (j % tabsize); /* cannot overflow */
6446 if (j > PY_SSIZE_T_MAX - incr)
6447 goto overflow1;
6448 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006449 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006452 if (j > PY_SSIZE_T_MAX - 1)
6453 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 j++;
6455 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006456 if (i > PY_SSIZE_T_MAX - j)
6457 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006459 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 }
6461 }
6462
Guido van Rossum5bdff602008-03-11 21:18:06 +00006463 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006464 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006465
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 /* Second pass: create output string and fill it */
6467 u = _PyUnicode_New(i + j);
6468 if (!u)
6469 return NULL;
6470
Guido van Rossum5bdff602008-03-11 21:18:06 +00006471 j = 0; /* same as in first pass */
6472 q = u->str; /* next output char */
6473 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474
6475 for (p = self->str; p < e; p++)
6476 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006477 if (tabsize > 0) {
6478 i = tabsize - (j % tabsize);
6479 j += i;
6480 while (i--) {
6481 if (q >= qe)
6482 goto overflow2;
6483 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006484 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006485 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006486 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006487 else {
6488 if (q >= qe)
6489 goto overflow2;
6490 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006491 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 if (*p == '\n' || *p == '\r')
6493 j = 0;
6494 }
6495
6496 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006497
6498 overflow2:
6499 Py_DECREF(u);
6500 overflow1:
6501 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503}
6504
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006505PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006506 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507\n\
6508Return the lowest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00006509such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510arguments start and end are interpreted as in slice notation.\n\
6511\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006512Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
6514static PyObject *
6515unicode_find(PyUnicodeObject *self, PyObject *args)
6516{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006517 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006518 Py_ssize_t start;
6519 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006520 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521
Facundo Batista57d56692007-11-16 18:04:14 +00006522 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006525 result = stringlib_find_slice(
6526 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6527 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6528 start, end
6529 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530
6531 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006532
6533 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534}
6535
6536static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006537unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538{
6539 if (index < 0 || index >= self->length) {
6540 PyErr_SetString(PyExc_IndexError, "string index out of range");
6541 return NULL;
6542 }
6543
6544 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6545}
6546
6547static long
6548unicode_hash(PyUnicodeObject *self)
6549{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006550 /* Since Unicode objects compare equal to their ASCII string
6551 counterparts, they should use the individual character values
6552 as basis for their hash value. This is needed to assure that
6553 strings and Unicode objects behave in the same way as
6554 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555
Martin v. Löwis18e16552006-02-15 17:27:45 +00006556 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006557 register Py_UNICODE *p;
6558 register long x;
6559
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006561 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006562 len = PyUnicode_GET_SIZE(self);
6563 p = PyUnicode_AS_UNICODE(self);
6564 x = *p << 7;
6565 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006566 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006567 x ^= PyUnicode_GET_SIZE(self);
6568 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006569 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006570 self->hash = x;
6571 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572}
6573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006574PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006575 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006577Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578
6579static PyObject *
6580unicode_index(PyUnicodeObject *self, PyObject *args)
6581{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006582 Py_ssize_t result;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006583 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006584 Py_ssize_t start;
6585 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586
Facundo Batista57d56692007-11-16 18:04:14 +00006587 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006590 result = stringlib_find_slice(
6591 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6592 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6593 start, end
6594 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595
6596 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006597
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 if (result < 0) {
6599 PyErr_SetString(PyExc_ValueError, "substring not found");
6600 return NULL;
6601 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006602
Martin v. Löwis18e16552006-02-15 17:27:45 +00006603 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604}
6605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006606PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006607 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006609Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006610at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
6612static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006613unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614{
6615 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6616 register const Py_UNICODE *e;
6617 int cased;
6618
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 /* Shortcut for single character strings */
6620 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006621 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006623 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006624 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006625 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006626
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 e = p + PyUnicode_GET_SIZE(self);
6628 cased = 0;
6629 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006630 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006631
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006632 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6633 return PyBool_FromLong(0);
6634 else if (!cased && Py_UNICODE_ISLOWER(ch))
6635 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006637 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638}
6639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006640PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006641 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006643Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006644at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645
6646static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006647unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
6649 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6650 register const Py_UNICODE *e;
6651 int cased;
6652
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 /* Shortcut for single character strings */
6654 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006655 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006657 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006658 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006659 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006660
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 e = p + PyUnicode_GET_SIZE(self);
6662 cased = 0;
6663 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006664 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006665
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006666 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6667 return PyBool_FromLong(0);
6668 else if (!cased && Py_UNICODE_ISUPPER(ch))
6669 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006671 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672}
6673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006674PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006675 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006677Return True if S is a titlecased string and there is at least one\n\
6678character in S, i.e. upper- and titlecase characters may only\n\
6679follow uncased characters and lowercase characters only cased ones.\n\
6680Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681
6682static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006683unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684{
6685 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6686 register const Py_UNICODE *e;
6687 int cased, previous_is_cased;
6688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 /* Shortcut for single character strings */
6690 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006691 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6692 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006694 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006695 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006696 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 e = p + PyUnicode_GET_SIZE(self);
6699 cased = 0;
6700 previous_is_cased = 0;
6701 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006702 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006703
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006704 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6705 if (previous_is_cased)
6706 return PyBool_FromLong(0);
6707 previous_is_cased = 1;
6708 cased = 1;
6709 }
6710 else if (Py_UNICODE_ISLOWER(ch)) {
6711 if (!previous_is_cased)
6712 return PyBool_FromLong(0);
6713 previous_is_cased = 1;
6714 cased = 1;
6715 }
6716 else
6717 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006719 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720}
6721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006722PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006723 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006725Return True if all characters in S are whitespace\n\
6726and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727
6728static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006729unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730{
6731 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6732 register const Py_UNICODE *e;
6733
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 /* Shortcut for single character strings */
6735 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006736 Py_UNICODE_ISSPACE(*p))
6737 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006739 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006740 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006741 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006742
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 e = p + PyUnicode_GET_SIZE(self);
6744 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006745 if (!Py_UNICODE_ISSPACE(*p))
6746 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006748 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749}
6750
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006751PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006752 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006753\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006754Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006755and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006756
6757static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006758unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006759{
6760 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6761 register const Py_UNICODE *e;
6762
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006763 /* Shortcut for single character strings */
6764 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006765 Py_UNICODE_ISALPHA(*p))
6766 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006767
6768 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006769 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006770 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006771
6772 e = p + PyUnicode_GET_SIZE(self);
6773 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006774 if (!Py_UNICODE_ISALPHA(*p))
6775 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006777 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006778}
6779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006780PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006781 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006782\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006783Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006784and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006785
6786static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006787unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006788{
6789 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6790 register const Py_UNICODE *e;
6791
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006792 /* Shortcut for single character strings */
6793 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006794 Py_UNICODE_ISALNUM(*p))
6795 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006796
6797 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006798 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006799 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006800
6801 e = p + PyUnicode_GET_SIZE(self);
6802 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006803 if (!Py_UNICODE_ISALNUM(*p))
6804 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006805 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006806 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006807}
6808
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006809PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006810 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006812Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006813False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814
6815static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006816unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817{
6818 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6819 register const Py_UNICODE *e;
6820
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 /* Shortcut for single character strings */
6822 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006823 Py_UNICODE_ISDECIMAL(*p))
6824 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006825
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006826 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006827 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006828 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006829
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830 e = p + PyUnicode_GET_SIZE(self);
6831 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006832 if (!Py_UNICODE_ISDECIMAL(*p))
6833 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006835 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836}
6837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006838PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006839 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006841Return True if all characters in S are digits\n\
6842and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
6844static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006845unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846{
6847 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6848 register const Py_UNICODE *e;
6849
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 /* Shortcut for single character strings */
6851 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006852 Py_UNICODE_ISDIGIT(*p))
6853 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006855 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006856 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006857 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006858
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 e = p + PyUnicode_GET_SIZE(self);
6860 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006861 if (!Py_UNICODE_ISDIGIT(*p))
6862 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006864 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865}
6866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006867PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006868 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006870Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006871False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872
6873static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006874unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875{
6876 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6877 register const Py_UNICODE *e;
6878
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 /* Shortcut for single character strings */
6880 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006881 Py_UNICODE_ISNUMERIC(*p))
6882 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006884 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006885 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006886 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006887
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 e = p + PyUnicode_GET_SIZE(self);
6889 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006890 if (!Py_UNICODE_ISNUMERIC(*p))
6891 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006893 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894}
6895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006896PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006897 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898\n\
6899Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006900iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901
6902static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006903unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006905 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906}
6907
Martin v. Löwis18e16552006-02-15 17:27:45 +00006908static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909unicode_length(PyUnicodeObject *self)
6910{
6911 return self->length;
6912}
6913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006914PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006915 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006917Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006918done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919
6920static PyObject *
6921unicode_ljust(PyUnicodeObject *self, PyObject *args)
6922{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006923 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006924 Py_UNICODE fillchar = ' ';
6925
Martin v. Löwis412fb672006-04-13 06:34:32 +00006926 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 return NULL;
6928
Tim Peters7a29bd52001-09-12 03:03:31 +00006929 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 Py_INCREF(self);
6931 return (PyObject*) self;
6932 }
6933
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006934 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935}
6936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006937PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006938 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006940Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941
6942static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006943unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 return fixup(self, fixlower);
6946}
6947
/* Strip modes: which end(s) of the string the strip helpers work on */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format for each strip mode: one optional object
   argument, followed by the method name used in error messages */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped */
#define STRIPNAME(i) (stripformat[i]+3)
6956
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006957/* externally visible for str.strip(unicode) */
6958PyObject *
6959_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6960{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006961 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6962 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6963 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6964 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6965 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006966
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006967 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006968
Benjamin Peterson857ce152009-01-31 16:29:18 +00006969 i = 0;
6970 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006971 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6972 i++;
6973 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006974 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006975
Benjamin Peterson857ce152009-01-31 16:29:18 +00006976 j = len;
6977 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006978 do {
6979 j--;
6980 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6981 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006982 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006983
Benjamin Peterson857ce152009-01-31 16:29:18 +00006984 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006985 Py_INCREF(self);
6986 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006987 }
6988 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006989 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006990}
6991
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992
6993static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006994do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006996 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6997 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006998
Benjamin Peterson857ce152009-01-31 16:29:18 +00006999 i = 0;
7000 if (striptype != RIGHTSTRIP) {
7001 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7002 i++;
7003 }
7004 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007005
Benjamin Peterson857ce152009-01-31 16:29:18 +00007006 j = len;
7007 if (striptype != LEFTSTRIP) {
7008 do {
7009 j--;
7010 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7011 j++;
7012 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007013
Benjamin Peterson857ce152009-01-31 16:29:18 +00007014 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7015 Py_INCREF(self);
7016 return (PyObject*)self;
7017 }
7018 else
7019 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020}
7021
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007022
7023static PyObject *
7024do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7025{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007026 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007027
Benjamin Peterson857ce152009-01-31 16:29:18 +00007028 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7029 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007030
Benjamin Peterson857ce152009-01-31 16:29:18 +00007031 if (sep != NULL && sep != Py_None) {
7032 if (PyUnicode_Check(sep))
7033 return _PyUnicode_XStrip(self, striptype, sep);
7034 else if (PyString_Check(sep)) {
7035 PyObject *res;
7036 sep = PyUnicode_FromObject(sep);
7037 if (sep==NULL)
7038 return NULL;
7039 res = _PyUnicode_XStrip(self, striptype, sep);
7040 Py_DECREF(sep);
7041 return res;
7042 }
7043 else {
7044 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007045 "%s arg must be None, unicode or str",
7046 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007047 return NULL;
7048 }
7049 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007050
Benjamin Peterson857ce152009-01-31 16:29:18 +00007051 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007052}
7053
7054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007055PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007056 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007057\n\
7058Return a copy of the string S with leading and trailing\n\
7059whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007060If chars is given and not None, remove characters in chars instead.\n\
7061If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007062
7063static PyObject *
7064unicode_strip(PyUnicodeObject *self, PyObject *args)
7065{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007066 if (PyTuple_GET_SIZE(args) == 0)
7067 return do_strip(self, BOTHSTRIP); /* Common case */
7068 else
7069 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007070}
7071
7072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007073PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007074 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007075\n\
7076Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007077If chars is given and not None, remove characters in chars instead.\n\
7078If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007079
7080static PyObject *
7081unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7082{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007083 if (PyTuple_GET_SIZE(args) == 0)
7084 return do_strip(self, LEFTSTRIP); /* Common case */
7085 else
7086 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007087}
7088
7089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007090PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007091 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007092\n\
7093Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007094If chars is given and not None, remove characters in chars instead.\n\
7095If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007096
7097static PyObject *
7098unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7099{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007100 if (PyTuple_GET_SIZE(args) == 0)
7101 return do_strip(self, RIGHTSTRIP); /* Common case */
7102 else
7103 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007104}
7105
7106
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007108unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109{
7110 PyUnicodeObject *u;
7111 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007112 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007113 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114
7115 if (len < 0)
7116 len = 0;
7117
Tim Peters7a29bd52001-09-12 03:03:31 +00007118 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007119 /* no repeat, return original string */
7120 Py_INCREF(str);
7121 return (PyObject*) str;
7122 }
Tim Peters8f422462000-09-09 06:13:41 +00007123
7124 /* ensure # of chars needed doesn't overflow int and # of bytes
7125 * needed doesn't overflow size_t
7126 */
7127 nchars = len * str->length;
7128 if (len && nchars / len != str->length) {
7129 PyErr_SetString(PyExc_OverflowError,
7130 "repeated string is too long");
7131 return NULL;
7132 }
7133 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7134 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7135 PyErr_SetString(PyExc_OverflowError,
7136 "repeated string is too long");
7137 return NULL;
7138 }
7139 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 if (!u)
7141 return NULL;
7142
7143 p = u->str;
7144
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007145 if (str->length == 1 && len > 0) {
7146 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007147 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007148 Py_ssize_t done = 0; /* number of characters copied this far */
7149 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007150 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007151 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007152 }
7153 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007154 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007155 Py_UNICODE_COPY(p+done, p, n);
7156 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007157 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159
7160 return (PyObject*) u;
7161}
7162
7163PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007164 PyObject *subobj,
7165 PyObject *replobj,
7166 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167{
7168 PyObject *self;
7169 PyObject *str1;
7170 PyObject *str2;
7171 PyObject *result;
7172
7173 self = PyUnicode_FromObject(obj);
7174 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 str1 = PyUnicode_FromObject(subobj);
7177 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007178 Py_DECREF(self);
7179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180 }
7181 str2 = PyUnicode_FromObject(replobj);
7182 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007183 Py_DECREF(self);
7184 Py_DECREF(str1);
7185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186 }
Tim Petersced69f82003-09-16 20:30:58 +00007187 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007188 (PyUnicodeObject *)str1,
7189 (PyUnicodeObject *)str2,
7190 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 Py_DECREF(self);
7192 Py_DECREF(str1);
7193 Py_DECREF(str2);
7194 return result;
7195}
7196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007197PyDoc_STRVAR(replace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007198 "S.replace (old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199\n\
7200Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007201old replaced by new. If the optional argument count is\n\
7202given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203
7204static PyObject*
7205unicode_replace(PyUnicodeObject *self, PyObject *args)
7206{
7207 PyUnicodeObject *str1;
7208 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007209 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 PyObject *result;
7211
Martin v. Löwis18e16552006-02-15 17:27:45 +00007212 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 return NULL;
7214 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7215 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007216 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007218 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007219 Py_DECREF(str1);
7220 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007221 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222
7223 result = replace(self, str1, str2, maxcount);
7224
7225 Py_DECREF(str1);
7226 Py_DECREF(str2);
7227 return result;
7228}
7229
7230static
7231PyObject *unicode_repr(PyObject *unicode)
7232{
7233 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007234 PyUnicode_GET_SIZE(unicode),
7235 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236}
7237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007238PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007239 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240\n\
7241Return the highest index in S where substring sub is found,\n\
Georg Brandl9efd9b62007-07-29 17:38:35 +00007242such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243arguments start and end are interpreted as in slice notation.\n\
7244\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007245Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246
7247static PyObject *
7248unicode_rfind(PyUnicodeObject *self, PyObject *args)
7249{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007250 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007251 Py_ssize_t start;
7252 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007253 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254
Facundo Batista57d56692007-11-16 18:04:14 +00007255 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007258 result = stringlib_rfind_slice(
7259 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7260 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7261 start, end
7262 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263
7264 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007265
7266 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267}
7268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007269PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007270 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007272Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273
7274static PyObject *
7275unicode_rindex(PyUnicodeObject *self, PyObject *args)
7276{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007277 PyObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007278 Py_ssize_t start;
7279 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007280 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281
Facundo Batista57d56692007-11-16 18:04:14 +00007282 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007285 result = stringlib_rfind_slice(
7286 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7287 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7288 start, end
7289 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290
7291 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007292
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293 if (result < 0) {
7294 PyErr_SetString(PyExc_ValueError, "substring not found");
7295 return NULL;
7296 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007297 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298}
7299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007300PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007301 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007303Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007304done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305
7306static PyObject *
7307unicode_rjust(PyUnicodeObject *self, PyObject *args)
7308{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007309 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007310 Py_UNICODE fillchar = ' ';
7311
Martin v. Löwis412fb672006-04-13 06:34:32 +00007312 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 return NULL;
7314
Tim Peters7a29bd52001-09-12 03:03:31 +00007315 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316 Py_INCREF(self);
7317 return (PyObject*) self;
7318 }
7319
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007320 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321}
7322
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007324unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007325{
7326 /* standard clamping */
7327 if (start < 0)
7328 start = 0;
7329 if (end < 0)
7330 end = 0;
7331 if (end > self->length)
7332 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007333 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 /* full slice, return original string */
7335 Py_INCREF(self);
7336 return (PyObject*) self;
7337 }
7338 if (start > end)
7339 start = end;
7340 /* copy slice */
7341 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007342 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343}
7344
7345PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007346 PyObject *sep,
7347 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348{
7349 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007350
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351 s = PyUnicode_FromObject(s);
7352 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007353 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007354 if (sep != NULL) {
7355 sep = PyUnicode_FromObject(sep);
7356 if (sep == NULL) {
7357 Py_DECREF(s);
7358 return NULL;
7359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360 }
7361
7362 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7363
7364 Py_DECREF(s);
7365 Py_XDECREF(sep);
7366 return result;
7367}
7368
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007369PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007370 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371\n\
7372Return a list of the words in S, using sep as the\n\
7373delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007374splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007375whitespace string is a separator and empty strings are\n\
7376removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377
7378static PyObject*
7379unicode_split(PyUnicodeObject *self, PyObject *args)
7380{
7381 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007382 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383
Martin v. Löwis18e16552006-02-15 17:27:45 +00007384 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385 return NULL;
7386
7387 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007388 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007390 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007391 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007392 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393}
7394
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007395PyObject *
7396PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7397{
7398 PyObject* str_obj;
7399 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007400 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007401
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007402 str_obj = PyUnicode_FromObject(str_in);
7403 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007404 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007405 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007406 if (!sep_obj) {
7407 Py_DECREF(str_obj);
7408 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007409 }
7410
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007411 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007412 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7413 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7414 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007415
Fredrik Lundhb9479482006-05-26 17:22:38 +00007416 Py_DECREF(sep_obj);
7417 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007418
7419 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007420}
7421
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007422
7423PyObject *
7424PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7425{
7426 PyObject* str_obj;
7427 PyObject* sep_obj;
7428 PyObject* out;
7429
7430 str_obj = PyUnicode_FromObject(str_in);
7431 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007432 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007433 sep_obj = PyUnicode_FromObject(sep_in);
7434 if (!sep_obj) {
7435 Py_DECREF(str_obj);
7436 return NULL;
7437 }
7438
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007439 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007440 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7441 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7442 );
7443
7444 Py_DECREF(sep_obj);
7445 Py_DECREF(str_obj);
7446
7447 return out;
7448}
7449
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007450PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007451 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007452\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007453Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007454the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007455found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007456
7457static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007458unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007459{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007460 return PyUnicode_Partition((PyObject *)self, separator);
7461}
7462
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007463PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007464 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007465\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007466Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007467the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007468separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007469
7470static PyObject*
7471unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7472{
7473 return PyUnicode_RPartition((PyObject *)self, separator);
7474}
7475
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007476PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007477 PyObject *sep,
7478 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007479{
7480 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007481
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007482 s = PyUnicode_FromObject(s);
7483 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007484 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007485 if (sep != NULL) {
7486 sep = PyUnicode_FromObject(sep);
7487 if (sep == NULL) {
7488 Py_DECREF(s);
7489 return NULL;
7490 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007491 }
7492
7493 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7494
7495 Py_DECREF(s);
7496 Py_XDECREF(sep);
7497 return result;
7498}
7499
7500PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007501 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007502\n\
7503Return a list of the words in S, using sep as the\n\
7504delimiter string, starting at the end of the string and\n\
7505working to the front. If maxsplit is given, at most maxsplit\n\
7506splits are done. If sep is not specified, any whitespace string\n\
7507is a separator.");
7508
7509static PyObject*
7510unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7511{
7512 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007513 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007514
Martin v. Löwis18e16552006-02-15 17:27:45 +00007515 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007516 return NULL;
7517
7518 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007519 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007520 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007521 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007522 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007523 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007524}
7525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007526PyDoc_STRVAR(splitlines__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007527 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528\n\
7529Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007530Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007531is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532
7533static PyObject*
7534unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7535{
Guido van Rossum86662912000-04-11 15:38:46 +00007536 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537
Guido van Rossum86662912000-04-11 15:38:46 +00007538 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539 return NULL;
7540
Guido van Rossum86662912000-04-11 15:38:46 +00007541 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542}
7543
7544static
7545PyObject *unicode_str(PyUnicodeObject *self)
7546{
Fred Drakee4315f52000-05-09 19:53:39 +00007547 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548}
7549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007550PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007551 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552\n\
7553Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007554and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555
7556static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007557unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 return fixup(self, fixswapcase);
7560}
7561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007562PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007563 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564\n\
7565Return a copy of the string S, where all characters have been mapped\n\
7566through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007567Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7568Unmapped characters are left untouched. Characters mapped to None\n\
7569are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570
7571static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007572unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573{
Tim Petersced69f82003-09-16 20:30:58 +00007574 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007575 self->length,
7576 table,
7577 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578}
7579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007580PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007581 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
7585static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007586unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588 return fixup(self, fixupper);
7589}
7590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007591PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007592 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593\n\
Georg Brandl98064072008-09-09 19:26:00 +00007594Pad a numeric string S with zeros on the left, to fill a field\n\
7595of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596
7597static PyObject *
7598unicode_zfill(PyUnicodeObject *self, PyObject *args)
7599{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007600 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 PyUnicodeObject *u;
7602
Martin v. Löwis18e16552006-02-15 17:27:45 +00007603 Py_ssize_t width;
7604 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 return NULL;
7606
7607 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007608 if (PyUnicode_CheckExact(self)) {
7609 Py_INCREF(self);
7610 return (PyObject*) self;
7611 }
7612 else
7613 return PyUnicode_FromUnicode(
7614 PyUnicode_AS_UNICODE(self),
7615 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007616 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617 }
7618
7619 fill = width - self->length;
7620
7621 u = pad(self, fill, 0, '0');
7622
Walter Dörwald068325e2002-04-15 13:36:47 +00007623 if (u == NULL)
7624 return NULL;
7625
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626 if (u->str[fill] == '+' || u->str[fill] == '-') {
7627 /* move sign to beginning of string */
7628 u->str[0] = u->str[fill];
7629 u->str[fill] = '0';
7630 }
7631
7632 return (PyObject*) u;
7633}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634
#if 0
/* Disabled helper: would return the file-level counter numfree as a
   Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007643PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007644 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007646Return True if S starts with the specified prefix, False otherwise.\n\
7647With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007648With optional end, stop comparing S at that position.\n\
7649prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650
7651static PyObject *
7652unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007653 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654{
Georg Brandl24250812006-06-09 18:45:48 +00007655 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007657 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007658 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007659 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
Georg Brandl24250812006-06-09 18:45:48 +00007661 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007662 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7663 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007664 if (PyTuple_Check(subobj)) {
7665 Py_ssize_t i;
7666 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7667 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007668 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007669 if (substring == NULL)
7670 return NULL;
7671 result = tailmatch(self, substring, start, end, -1);
7672 Py_DECREF(substring);
7673 if (result) {
7674 Py_RETURN_TRUE;
7675 }
7676 }
7677 /* nothing matched */
7678 Py_RETURN_FALSE;
7679 }
7680 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007682 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007683 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007685 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686}
7687
7688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007689PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007690 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007692Return True if S ends with the specified suffix, False otherwise.\n\
7693With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007694With optional end, stop comparing S at that position.\n\
7695suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696
7697static PyObject *
7698unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007699 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700{
Georg Brandl24250812006-06-09 18:45:48 +00007701 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007703 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007704 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007705 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706
Georg Brandl24250812006-06-09 18:45:48 +00007707 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007708 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7709 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007710 if (PyTuple_Check(subobj)) {
7711 Py_ssize_t i;
7712 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7713 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007714 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007715 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007716 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007717 result = tailmatch(self, substring, start, end, +1);
7718 Py_DECREF(substring);
7719 if (result) {
7720 Py_RETURN_TRUE;
7721 }
7722 }
7723 Py_RETURN_FALSE;
7724 }
7725 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007727 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728
Georg Brandl24250812006-06-09 18:45:48 +00007729 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007731 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732}
7733
7734
Eric Smitha9f7d622008-02-17 19:46:49 +00007735/* Implements do_string_format, which is unicode because of stringlib */
7736#include "stringlib/string_format.h"
7737
7738PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007739 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007740\n\
7741");
7742
Eric Smithdc13b792008-05-30 18:10:04 +00007743static PyObject *
7744unicode__format__(PyObject *self, PyObject *args)
7745{
7746 PyObject *format_spec;
7747 PyObject *result = NULL;
7748 PyObject *tmp = NULL;
7749
7750 /* If 2.x, convert format_spec to the same type as value */
7751 /* This is to allow things like u''.format('') */
7752 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7753 goto done;
7754 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7755 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007756 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007757 goto done;
7758 }
7759 tmp = PyObject_Unicode(format_spec);
7760 if (tmp == NULL)
7761 goto done;
7762 format_spec = tmp;
7763
7764 result = _PyUnicode_FormatAdvanced(self,
7765 PyUnicode_AS_UNICODE(format_spec),
7766 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007767 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007768 Py_XDECREF(tmp);
7769 return result;
7770}
7771
Eric Smitha9f7d622008-02-17 19:46:49 +00007772PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007773 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007774\n\
7775");
7776
Robert Schuppenies901c9972008-06-10 10:10:31 +00007777static PyObject *
7778unicode__sizeof__(PyUnicodeObject *v)
7779{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007780 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7781 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007782}
7783
7784PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007785 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007786\n\
7787");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007788
7789static PyObject *
7790unicode_getnewargs(PyUnicodeObject *v)
7791{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007792 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007793}
7794
7795
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796static PyMethodDef unicode_methods[] = {
7797
7798 /* Order is according to common usage: often used methods should
7799 appear first, since lookup is done sequentially. */
7800
Benjamin Peterson332d7212009-09-18 21:14:55 +00007801 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007802 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7803 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007804 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007805 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7806 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7807 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7808 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7809 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7810 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7811 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007812 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007813 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7814 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7815 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007816 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007817 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007818/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7819 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7820 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7821 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007822 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007823 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007824 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007825 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007826 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7827 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7828 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7829 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7830 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7831 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7832 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7833 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7834 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7835 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7836 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7837 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7838 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7839 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007840 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007841 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7842 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7843 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7844 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007845 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007846#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007847 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848#endif
7849
7850#if 0
7851 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007852 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853#endif
7854
Benjamin Peterson857ce152009-01-31 16:29:18 +00007855 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856 {NULL, NULL}
7857};
7858
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007859static PyObject *
7860unicode_mod(PyObject *v, PyObject *w)
7861{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007862 if (!PyUnicode_Check(v)) {
7863 Py_INCREF(Py_NotImplemented);
7864 return Py_NotImplemented;
7865 }
7866 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007867}
7868
7869static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007870 0, /*nb_add*/
7871 0, /*nb_subtract*/
7872 0, /*nb_multiply*/
7873 0, /*nb_divide*/
7874 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007875};
7876
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007878 (lenfunc) unicode_length, /* sq_length */
7879 PyUnicode_Concat, /* sq_concat */
7880 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7881 (ssizeargfunc) unicode_getitem, /* sq_item */
7882 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7883 0, /* sq_ass_item */
7884 0, /* sq_ass_slice */
7885 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886};
7887
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007888static PyObject*
7889unicode_subscript(PyUnicodeObject* self, PyObject* item)
7890{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007891 if (PyIndex_Check(item)) {
7892 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007893 if (i == -1 && PyErr_Occurred())
7894 return NULL;
7895 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007896 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007897 return unicode_getitem(self, i);
7898 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007899 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007900 Py_UNICODE* source_buf;
7901 Py_UNICODE* result_buf;
7902 PyObject* result;
7903
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007904 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007905 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007906 return NULL;
7907 }
7908
7909 if (slicelength <= 0) {
7910 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007911 } else if (start == 0 && step == 1 && slicelength == self->length &&
7912 PyUnicode_CheckExact(self)) {
7913 Py_INCREF(self);
7914 return (PyObject *)self;
7915 } else if (step == 1) {
7916 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007917 } else {
7918 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007919 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7920 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007921
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007922 if (result_buf == NULL)
7923 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007924
7925 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7926 result_buf[i] = source_buf[cur];
7927 }
Tim Petersced69f82003-09-16 20:30:58 +00007928
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007929 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007930 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007931 return result;
7932 }
7933 } else {
7934 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7935 return NULL;
7936 }
7937}
7938
7939static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007940 (lenfunc)unicode_length, /* mp_length */
7941 (binaryfunc)unicode_subscript, /* mp_subscript */
7942 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007943};
7944
Martin v. Löwis18e16552006-02-15 17:27:45 +00007945static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007947 Py_ssize_t index,
7948 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949{
7950 if (index != 0) {
7951 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007952 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 return -1;
7954 }
7955 *ptr = (void *) self->str;
7956 return PyUnicode_GET_DATA_SIZE(self);
7957}
7958
Martin v. Löwis18e16552006-02-15 17:27:45 +00007959static Py_ssize_t
7960unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007961 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962{
7963 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007964 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 return -1;
7966}
7967
7968static int
7969unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007970 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971{
7972 if (lenp)
7973 *lenp = PyUnicode_GET_DATA_SIZE(self);
7974 return 1;
7975}
7976
Martin v. Löwiseb079f12006-02-16 14:32:27 +00007977static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007979 Py_ssize_t index,
7980 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981{
7982 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00007983
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 if (index != 0) {
7985 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007986 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 return -1;
7988 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007989 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007991 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00007992 *ptr = (void *) PyString_AS_STRING(str);
7993 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994}
7995
7996/* Helpers for PyUnicode_Format() */
7997
7998static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007999getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008001 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008003 (*p_argidx)++;
8004 if (arglen < 0)
8005 return args;
8006 else
8007 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008 }
8009 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008010 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 return NULL;
8012}
8013
/* Bit flags collected while parsing a format specifier; each one is set
   by the flag character shown. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019
Martin v. Löwis18e16552006-02-15 17:27:45 +00008020static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008021strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008022{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008023 register Py_ssize_t i;
8024 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008026 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 return len;
8029}
8030
Neal Norwitzfc76d632006-01-10 06:03:13 +00008031static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008032longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8033{
Tim Peters15231542006-02-16 01:08:01 +00008034 Py_ssize_t result;
8035
Neal Norwitzfc76d632006-01-10 06:03:13 +00008036 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008037 result = strtounicode(buffer, (char *)buffer);
8038 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008039}
8040
Guido van Rossum078151d2002-08-11 04:24:12 +00008041/* XXX To save some code duplication, formatfloat/long/int could have been
8042 shared with stringobject.c, converting from 8-bit to Unicode after the
8043 formatting is done. */
8044
Mark Dickinson18cfada2009-11-23 18:46:41 +00008045/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8046
8047static PyObject *
8048formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008050 char *p;
8051 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008053
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054 x = PyFloat_AsDouble(v);
8055 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008056 return NULL;
8057
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008059 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008060
Mark Dickinson18cfada2009-11-23 18:46:41 +00008061 p = PyOS_double_to_string(x, type, prec,
8062 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8063 if (p == NULL)
8064 return NULL;
8065 result = PyUnicode_FromStringAndSize(p, strlen(p));
8066 PyMem_Free(p);
8067 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068}
8069
Tim Peters38fd5b62000-09-21 05:43:11 +00008070static PyObject*
8071formatlong(PyObject *val, int flags, int prec, int type)
8072{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008073 char *buf;
8074 int i, len;
8075 PyObject *str; /* temporary string object. */
8076 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008077
Benjamin Peterson857ce152009-01-31 16:29:18 +00008078 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8079 if (!str)
8080 return NULL;
8081 result = _PyUnicode_New(len);
8082 if (!result) {
8083 Py_DECREF(str);
8084 return NULL;
8085 }
8086 for (i = 0; i < len; i++)
8087 result->str[i] = buf[i];
8088 result->str[len] = 0;
8089 Py_DECREF(str);
8090 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008091}
8092
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093static int
8094formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008095 size_t buflen,
8096 int flags,
8097 int prec,
8098 int type,
8099 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008101 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008102 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8103 * + 1 + 1
8104 * = 24
8105 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008106 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008107 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108 long x;
8109
8110 x = PyInt_AsLong(v);
8111 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008112 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008113 if (x < 0 && type == 'u') {
8114 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008115 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008116 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8117 sign = "-";
8118 else
8119 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008121 prec = 1;
8122
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008123 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8124 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008125 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008126 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008127 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008128 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008129 return -1;
8130 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008131
8132 if ((flags & F_ALT) &&
8133 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008134 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008135 * of issues that cause pain:
8136 * - when 0 is being converted, the C standard leaves off
8137 * the '0x' or '0X', which is inconsistent with other
8138 * %#x/%#X conversions and inconsistent with Python's
8139 * hex() function
8140 * - there are platforms that violate the standard and
8141 * convert 0 with the '0x' or '0X'
8142 * (Metrowerks, Compaq Tru64)
8143 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008144 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008145 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008146 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008147 * We can achieve the desired consistency by inserting our
8148 * own '0x' or '0X' prefix, and substituting %x/%X in place
8149 * of %#x/%#X.
8150 *
8151 * Note that this is the same approach as used in
8152 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008153 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008154 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8155 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008156 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008157 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008158 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8159 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008160 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008161 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008162 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008163 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008164 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008165 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166}
8167
8168static int
8169formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008170 size_t buflen,
8171 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008173 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008174 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008175 if (PyUnicode_GET_SIZE(v) != 1)
8176 goto onError;
8177 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008179
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008180 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008181 if (PyString_GET_SIZE(v) != 1)
8182 goto onError;
8183 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185
8186 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008187 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008189 x = PyInt_AsLong(v);
8190 if (x == -1 && PyErr_Occurred())
8191 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008192#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008193 if (x < 0 || x > 0x10ffff) {
8194 PyErr_SetString(PyExc_OverflowError,
8195 "%c arg not in range(0x110000) "
8196 "(wide Python build)");
8197 return -1;
8198 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008199#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008200 if (x < 0 || x > 0xffff) {
8201 PyErr_SetString(PyExc_OverflowError,
8202 "%c arg not in range(0x10000) "
8203 "(narrow Python build)");
8204 return -1;
8205 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008206#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008207 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208 }
8209 buf[1] = '\0';
8210 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008212 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008213 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008214 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008215 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216}
8217
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. after sizeof or next to a postfix
   operator).
*/
#define FORMATBUFLEN ((size_t)120)
8227
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008229 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230{
8231 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008232 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233 int args_owned = 0;
8234 PyUnicodeObject *result = NULL;
8235 PyObject *dict = NULL;
8236 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008237
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008239 PyErr_BadInternalCall();
8240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 }
8242 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008243 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245 fmt = PyUnicode_AS_UNICODE(uformat);
8246 fmtcnt = PyUnicode_GET_SIZE(uformat);
8247
8248 reslen = rescnt = fmtcnt + 100;
8249 result = _PyUnicode_New(reslen);
8250 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008251 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 res = PyUnicode_AS_UNICODE(result);
8253
8254 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008255 arglen = PyTuple_Size(args);
8256 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257 }
8258 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008259 arglen = -1;
8260 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 }
Christian Heimese93237d2007-12-19 02:37:44 +00008262 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008263 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008264 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265
8266 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008267 if (*fmt != '%') {
8268 if (--rescnt < 0) {
8269 rescnt = fmtcnt + 100;
8270 reslen += rescnt;
8271 if (_PyUnicode_Resize(&result, reslen) < 0)
8272 goto onError;
8273 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8274 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008275 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008276 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008277 }
8278 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008279 /* Got a format specifier */
8280 int flags = 0;
8281 Py_ssize_t width = -1;
8282 int prec = -1;
8283 Py_UNICODE c = '\0';
8284 Py_UNICODE fill;
8285 int isnumok;
8286 PyObject *v = NULL;
8287 PyObject *temp = NULL;
8288 Py_UNICODE *pbuf;
8289 Py_UNICODE sign;
8290 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008291 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008292
8293 fmt++;
8294 if (*fmt == '(') {
8295 Py_UNICODE *keystart;
8296 Py_ssize_t keylen;
8297 PyObject *key;
8298 int pcount = 1;
8299
8300 if (dict == NULL) {
8301 PyErr_SetString(PyExc_TypeError,
8302 "format requires a mapping");
8303 goto onError;
8304 }
8305 ++fmt;
8306 --fmtcnt;
8307 keystart = fmt;
8308 /* Skip over balanced parentheses */
8309 while (pcount > 0 && --fmtcnt >= 0) {
8310 if (*fmt == ')')
8311 --pcount;
8312 else if (*fmt == '(')
8313 ++pcount;
8314 fmt++;
8315 }
8316 keylen = fmt - keystart - 1;
8317 if (fmtcnt < 0 || pcount > 0) {
8318 PyErr_SetString(PyExc_ValueError,
8319 "incomplete format key");
8320 goto onError;
8321 }
8322#if 0
8323 /* keys are converted to strings using UTF-8 and
8324 then looked up since Python uses strings to hold
8325 variables names etc. in its namespaces and we
8326 wouldn't want to break common idioms. */
8327 key = PyUnicode_EncodeUTF8(keystart,
8328 keylen,
8329 NULL);
8330#else
8331 key = PyUnicode_FromUnicode(keystart, keylen);
8332#endif
8333 if (key == NULL)
8334 goto onError;
8335 if (args_owned) {
8336 Py_DECREF(args);
8337 args_owned = 0;
8338 }
8339 args = PyObject_GetItem(dict, key);
8340 Py_DECREF(key);
8341 if (args == NULL) {
8342 goto onError;
8343 }
8344 args_owned = 1;
8345 arglen = -1;
8346 argidx = -2;
8347 }
8348 while (--fmtcnt >= 0) {
8349 switch (c = *fmt++) {
8350 case '-': flags |= F_LJUST; continue;
8351 case '+': flags |= F_SIGN; continue;
8352 case ' ': flags |= F_BLANK; continue;
8353 case '#': flags |= F_ALT; continue;
8354 case '0': flags |= F_ZERO; continue;
8355 }
8356 break;
8357 }
8358 if (c == '*') {
8359 v = getnextarg(args, arglen, &argidx);
8360 if (v == NULL)
8361 goto onError;
8362 if (!PyInt_Check(v)) {
8363 PyErr_SetString(PyExc_TypeError,
8364 "* wants int");
8365 goto onError;
8366 }
8367 width = PyInt_AsLong(v);
8368 if (width < 0) {
8369 flags |= F_LJUST;
8370 width = -width;
8371 }
8372 if (--fmtcnt >= 0)
8373 c = *fmt++;
8374 }
8375 else if (c >= '0' && c <= '9') {
8376 width = c - '0';
8377 while (--fmtcnt >= 0) {
8378 c = *fmt++;
8379 if (c < '0' || c > '9')
8380 break;
8381 if ((width*10) / 10 != width) {
8382 PyErr_SetString(PyExc_ValueError,
8383 "width too big");
8384 goto onError;
8385 }
8386 width = width*10 + (c - '0');
8387 }
8388 }
8389 if (c == '.') {
8390 prec = 0;
8391 if (--fmtcnt >= 0)
8392 c = *fmt++;
8393 if (c == '*') {
8394 v = getnextarg(args, arglen, &argidx);
8395 if (v == NULL)
8396 goto onError;
8397 if (!PyInt_Check(v)) {
8398 PyErr_SetString(PyExc_TypeError,
8399 "* wants int");
8400 goto onError;
8401 }
8402 prec = PyInt_AsLong(v);
8403 if (prec < 0)
8404 prec = 0;
8405 if (--fmtcnt >= 0)
8406 c = *fmt++;
8407 }
8408 else if (c >= '0' && c <= '9') {
8409 prec = c - '0';
8410 while (--fmtcnt >= 0) {
8411 c = Py_CHARMASK(*fmt++);
8412 if (c < '0' || c > '9')
8413 break;
8414 if ((prec*10) / 10 != prec) {
8415 PyErr_SetString(PyExc_ValueError,
8416 "prec too big");
8417 goto onError;
8418 }
8419 prec = prec*10 + (c - '0');
8420 }
8421 }
8422 } /* prec */
8423 if (fmtcnt >= 0) {
8424 if (c == 'h' || c == 'l' || c == 'L') {
8425 if (--fmtcnt >= 0)
8426 c = *fmt++;
8427 }
8428 }
8429 if (fmtcnt < 0) {
8430 PyErr_SetString(PyExc_ValueError,
8431 "incomplete format");
8432 goto onError;
8433 }
8434 if (c != '%') {
8435 v = getnextarg(args, arglen, &argidx);
8436 if (v == NULL)
8437 goto onError;
8438 }
8439 sign = 0;
8440 fill = ' ';
8441 switch (c) {
8442
8443 case '%':
8444 pbuf = formatbuf;
8445 /* presume that buffer length is at least 1 */
8446 pbuf[0] = '%';
8447 len = 1;
8448 break;
8449
8450 case 's':
8451 case 'r':
8452 if (PyUnicode_Check(v) && c == 's') {
8453 temp = v;
8454 Py_INCREF(temp);
8455 }
8456 else {
8457 PyObject *unicode;
8458 if (c == 's')
8459 temp = PyObject_Unicode(v);
8460 else
8461 temp = PyObject_Repr(v);
8462 if (temp == NULL)
8463 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008464 if (PyUnicode_Check(temp))
8465 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008466 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008467 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008468 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8469 PyString_GET_SIZE(temp),
8470 NULL,
8471 "strict");
8472 Py_DECREF(temp);
8473 temp = unicode;
8474 if (temp == NULL)
8475 goto onError;
8476 }
8477 else {
8478 Py_DECREF(temp);
8479 PyErr_SetString(PyExc_TypeError,
8480 "%s argument has non-string str()");
8481 goto onError;
8482 }
8483 }
8484 pbuf = PyUnicode_AS_UNICODE(temp);
8485 len = PyUnicode_GET_SIZE(temp);
8486 if (prec >= 0 && len > prec)
8487 len = prec;
8488 break;
8489
8490 case 'i':
8491 case 'd':
8492 case 'u':
8493 case 'o':
8494 case 'x':
8495 case 'X':
8496 if (c == 'i')
8497 c = 'd';
8498 isnumok = 0;
8499 if (PyNumber_Check(v)) {
8500 PyObject *iobj=NULL;
8501
8502 if (PyInt_Check(v) || (PyLong_Check(v))) {
8503 iobj = v;
8504 Py_INCREF(iobj);
8505 }
8506 else {
8507 iobj = PyNumber_Int(v);
8508 if (iobj==NULL) iobj = PyNumber_Long(v);
8509 }
8510 if (iobj!=NULL) {
8511 if (PyInt_Check(iobj)) {
8512 isnumok = 1;
8513 pbuf = formatbuf;
8514 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8515 flags, prec, c, iobj);
8516 Py_DECREF(iobj);
8517 if (len < 0)
8518 goto onError;
8519 sign = 1;
8520 }
8521 else if (PyLong_Check(iobj)) {
8522 isnumok = 1;
8523 temp = formatlong(iobj, flags, prec, c);
8524 Py_DECREF(iobj);
8525 if (!temp)
8526 goto onError;
8527 pbuf = PyUnicode_AS_UNICODE(temp);
8528 len = PyUnicode_GET_SIZE(temp);
8529 sign = 1;
8530 }
8531 else {
8532 Py_DECREF(iobj);
8533 }
8534 }
8535 }
8536 if (!isnumok) {
8537 PyErr_Format(PyExc_TypeError,
8538 "%%%c format: a number is required, "
8539 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8540 goto onError;
8541 }
8542 if (flags & F_ZERO)
8543 fill = '0';
8544 break;
8545
8546 case 'e':
8547 case 'E':
8548 case 'f':
8549 case 'F':
8550 case 'g':
8551 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008552 temp = formatfloat(v, flags, prec, c);
8553 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008554 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008555 pbuf = PyUnicode_AS_UNICODE(temp);
8556 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008557 sign = 1;
8558 if (flags & F_ZERO)
8559 fill = '0';
8560 break;
8561
8562 case 'c':
8563 pbuf = formatbuf;
8564 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8565 if (len < 0)
8566 goto onError;
8567 break;
8568
8569 default:
8570 PyErr_Format(PyExc_ValueError,
8571 "unsupported format character '%c' (0x%x) "
8572 "at index %zd",
8573 (31<=c && c<=126) ? (char)c : '?',
8574 (int)c,
8575 (Py_ssize_t)(fmt - 1 -
8576 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008577 goto onError;
8578 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008579 if (sign) {
8580 if (*pbuf == '-' || *pbuf == '+') {
8581 sign = *pbuf++;
8582 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008583 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008584 else if (flags & F_SIGN)
8585 sign = '+';
8586 else if (flags & F_BLANK)
8587 sign = ' ';
8588 else
8589 sign = 0;
8590 }
8591 if (width < len)
8592 width = len;
8593 if (rescnt - (sign != 0) < width) {
8594 reslen -= rescnt;
8595 rescnt = width + fmtcnt + 100;
8596 reslen += rescnt;
8597 if (reslen < 0) {
8598 Py_XDECREF(temp);
8599 PyErr_NoMemory();
8600 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008601 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008602 if (_PyUnicode_Resize(&result, reslen) < 0) {
8603 Py_XDECREF(temp);
8604 goto onError;
8605 }
8606 res = PyUnicode_AS_UNICODE(result)
8607 + reslen - rescnt;
8608 }
8609 if (sign) {
8610 if (fill != ' ')
8611 *res++ = sign;
8612 rescnt--;
8613 if (width > len)
8614 width--;
8615 }
8616 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8617 assert(pbuf[0] == '0');
8618 assert(pbuf[1] == c);
8619 if (fill != ' ') {
8620 *res++ = *pbuf++;
8621 *res++ = *pbuf++;
8622 }
8623 rescnt -= 2;
8624 width -= 2;
8625 if (width < 0)
8626 width = 0;
8627 len -= 2;
8628 }
8629 if (width > len && !(flags & F_LJUST)) {
8630 do {
8631 --rescnt;
8632 *res++ = fill;
8633 } while (--width > len);
8634 }
8635 if (fill == ' ') {
8636 if (sign)
8637 *res++ = sign;
8638 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8639 assert(pbuf[0] == '0');
8640 assert(pbuf[1] == c);
8641 *res++ = *pbuf++;
8642 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008643 }
8644 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008645 Py_UNICODE_COPY(res, pbuf, len);
8646 res += len;
8647 rescnt -= len;
8648 while (--width >= len) {
8649 --rescnt;
8650 *res++ = ' ';
8651 }
8652 if (dict && (argidx < arglen) && c != '%') {
8653 PyErr_SetString(PyExc_TypeError,
8654 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008655 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008656 goto onError;
8657 }
8658 Py_XDECREF(temp);
8659 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 } /* until end */
8661 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008662 PyErr_SetString(PyExc_TypeError,
8663 "not all arguments converted during string formatting");
8664 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 }
8666
Thomas Woutersa96affe2006-03-12 00:29:36 +00008667 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008668 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008670 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 }
8672 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 return (PyObject *)result;
8674
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008675 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 Py_XDECREF(result);
8677 Py_DECREF(uformat);
8678 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008679 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 }
8681 return NULL;
8682}
8683
8684static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008685 (readbufferproc) unicode_buffer_getreadbuf,
8686 (writebufferproc) unicode_buffer_getwritebuf,
8687 (segcountproc) unicode_buffer_getsegcount,
8688 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689};
8690
Jeremy Hylton938ace62002-07-17 16:30:39 +00008691static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008692unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8693
Tim Peters6d6c1a32001-08-02 04:15:00 +00008694static PyObject *
8695unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8696{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008697 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008698 static char *kwlist[] = {"string", "encoding", "errors", 0};
8699 char *encoding = NULL;
8700 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008701
Benjamin Peterson857ce152009-01-31 16:29:18 +00008702 if (type != &PyUnicode_Type)
8703 return unicode_subtype_new(type, args, kwds);
8704 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008705 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008706 return NULL;
8707 if (x == NULL)
8708 return (PyObject *)_PyUnicode_New(0);
8709 if (encoding == NULL && errors == NULL)
8710 return PyObject_Unicode(x);
8711 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008712 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008713}
8714
Guido van Rossume023fe02001-08-30 03:12:59 +00008715static PyObject *
8716unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8717{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008718 PyUnicodeObject *tmp, *pnew;
8719 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008720
Benjamin Peterson857ce152009-01-31 16:29:18 +00008721 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8722 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8723 if (tmp == NULL)
8724 return NULL;
8725 assert(PyUnicode_Check(tmp));
8726 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8727 if (pnew == NULL) {
8728 Py_DECREF(tmp);
8729 return NULL;
8730 }
8731 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8732 if (pnew->str == NULL) {
8733 _Py_ForgetReference((PyObject *)pnew);
8734 PyObject_Del(pnew);
8735 Py_DECREF(tmp);
8736 return PyErr_NoMemory();
8737 }
8738 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8739 pnew->length = n;
8740 pnew->hash = tmp->hash;
8741 Py_DECREF(tmp);
8742 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008743}
8744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008745PyDoc_STRVAR(unicode_doc,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008746 "unicode(string [, encoding[, errors]]) -> object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008747\n\
8748Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008749encoding defaults to the current default string encoding.\n\
8750errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008751
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008753 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008754 "unicode", /* tp_name */
8755 sizeof(PyUnicodeObject), /* tp_size */
8756 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008758 (destructor)unicode_dealloc, /* tp_dealloc */
8759 0, /* tp_print */
8760 0, /* tp_getattr */
8761 0, /* tp_setattr */
8762 0, /* tp_compare */
8763 unicode_repr, /* tp_repr */
8764 &unicode_as_number, /* tp_as_number */
8765 &unicode_as_sequence, /* tp_as_sequence */
8766 &unicode_as_mapping, /* tp_as_mapping */
8767 (hashfunc) unicode_hash, /* tp_hash*/
8768 0, /* tp_call*/
8769 (reprfunc) unicode_str, /* tp_str */
8770 PyObject_GenericGetAttr, /* tp_getattro */
8771 0, /* tp_setattro */
8772 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008773 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008774 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008775 unicode_doc, /* tp_doc */
8776 0, /* tp_traverse */
8777 0, /* tp_clear */
8778 PyUnicode_RichCompare, /* tp_richcompare */
8779 0, /* tp_weaklistoffset */
8780 0, /* tp_iter */
8781 0, /* tp_iternext */
8782 unicode_methods, /* tp_methods */
8783 0, /* tp_members */
8784 0, /* tp_getset */
8785 &PyBaseString_Type, /* tp_base */
8786 0, /* tp_dict */
8787 0, /* tp_descr_get */
8788 0, /* tp_descr_set */
8789 0, /* tp_dictoffset */
8790 0, /* tp_init */
8791 0, /* tp_alloc */
8792 unicode_new, /* tp_new */
8793 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794};
8795
8796/* Initialize the Unicode implementation */
8797
Thomas Wouters78890102000-07-22 19:25:51 +00008798void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008800 int i;
8801
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008802 /* XXX - move this array to unicodectype.c ? */
8803 Py_UNICODE linebreak[] = {
8804 0x000A, /* LINE FEED */
8805 0x000D, /* CARRIAGE RETURN */
8806 0x001C, /* FILE SEPARATOR */
8807 0x001D, /* GROUP SEPARATOR */
8808 0x001E, /* RECORD SEPARATOR */
8809 0x0085, /* NEXT LINE */
8810 0x2028, /* LINE SEPARATOR */
8811 0x2029, /* PARAGRAPH SEPARATOR */
8812 };
8813
Fred Drakee4315f52000-05-09 19:53:39 +00008814 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008815 free_list = NULL;
8816 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008818 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008819 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008820
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008821 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008822 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008823 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008824 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008825 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008826
8827 /* initialize the linebreak bloom filter */
8828 bloom_linebreak = make_bloom_mask(
8829 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8830 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008831
8832 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008833}
8834
8835/* Finalize the Unicode implementation */
8836
Christian Heimes3b718a72008-02-14 12:47:33 +00008837int
8838PyUnicode_ClearFreeList(void)
8839{
8840 int freelist_size = numfree;
8841 PyUnicodeObject *u;
8842
8843 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008844 PyUnicodeObject *v = u;
8845 u = *(PyUnicodeObject **)u;
8846 if (v->str)
8847 PyObject_DEL(v->str);
8848 Py_XDECREF(v->defenc);
8849 PyObject_Del(v);
8850 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008851 }
8852 free_list = NULL;
8853 assert(numfree == 0);
8854 return freelist_size;
8855}
8856
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857void
Thomas Wouters78890102000-07-22 19:25:51 +00008858_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008860 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008861
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008862 Py_XDECREF(unicode_empty);
8863 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008864
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008865 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008866 if (unicode_latin1[i]) {
8867 Py_DECREF(unicode_latin1[i]);
8868 unicode_latin1[i] = NULL;
8869 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008870 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008871 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008873
Anthony Baxterac6bd462006-04-13 02:06:09 +00008874#ifdef __cplusplus
8875}
8876#endif