blob: 7713b5497e1d7f920ca43565084107bf8a5f3600 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry i is 1 when ASCII code i is treated as whitespace,
   0 otherwise.  The table has one entry per 7-bit code (128 in total). */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
     *   0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     *   0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
     *   0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
     *   0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same idea for line breaks: entry i is 1 when ASCII code i is treated
   as a line break, 0 otherwise (128 entries).  The table is only ever
   read (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F
     *   0x0A LINE FEED, 0x0B LINE TABULATION,
     *   0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
     *   0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000180#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as
   the bit index. */
192
/* the linebreak mask is set up by _PyUnicode_Init() below */
194
Antoine Pitrou10042922010-01-13 14:01:26 +0000195#if LONG_BIT >= 128
196#define BLOOM_WIDTH 128
197#elif LONG_BIT >= 64
198#define BLOOM_WIDTH 64
199#elif LONG_BIT >= 32
200#define BLOOM_WIDTH 32
201#else
202#error "LONG_BIT is smaller than 32"
203#endif
204
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000205#define BLOOM_MASK unsigned long
206
207static BLOOM_MASK bloom_linebreak;
208
/* BLOOM_ADD(mask, ch): set the bit selected by the low bits of ch in mask
   (mask must be a modifiable lvalue).
   BLOOM(mask, ch): non-zero when the bit selected by ch is set in mask,
   i.e. ch *may* be in the set the mask was built from; zero means it is
   definitely not.
   Every macro argument is parenthesized so that expression arguments
   (e.g. BLOOM(a | b, ch)) are evaluated as a unit. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: ASCII code points use the ascii_linebreak table
   directly; anything else is pre-filtered through the bloom mask before
   the full Py_UNICODE_ISLINEBREAK() check.  Note that ch is evaluated
   more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
Antoine Pitrou10042922010-01-13 14:01:26 +0000220 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000225 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226
227 return mask;
228}
229
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
Fredrik Lundh77633512006-05-23 19:47:35 +0000238 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239}
240
/* Membership test: the cheap bloom check runs first and the linear scan
   in unicode_member() only on a possible hit.  The whole expansion is
   parenthesized so the macro behaves as a single operand (e.g. under `!`
   or next to `||`).  Note that chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259
Benjamin Peterson857ce152009-01-31 16:29:18 +0000260 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000271 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000288 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 }
290 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 return 0;
293}
294
295/* We allocate one more byte to make sure the string is
Georg Brandle27d0442010-08-01 20:54:30 +0000296 Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000299 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Andrew Dalkee0df7622006-05-27 11:04:36 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitze7d8be82008-07-31 17:17:14 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000324 if (unicode->str) {
325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000329 PyObject_DEL(unicode->str);
330 unicode->str = NULL;
331 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000349 PyErr_NoMemory();
350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000363 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000365
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000366 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000367 /* XXX UNREF/NEWREF interface should be more symmetrical */
368 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000369 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000370 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372}
373
374static
Guido van Rossum9475a232001-10-05 20:51:39 +0000375void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000377 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000378 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000379 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000380 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
381 PyObject_DEL(unicode->str);
382 unicode->str = NULL;
383 unicode->length = 0;
384 }
385 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000386 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000387 }
388 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000389 *(PyUnicodeObject **)unicode = free_list;
390 free_list = unicode;
391 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 }
393 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000394 PyObject_DEL(unicode->str);
395 Py_XDECREF(unicode->defenc);
396 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 }
398}
399
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000400static
401int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000402{
403 register PyUnicodeObject *v;
404
405 /* Argument checks */
406 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyErr_BadInternalCall();
408 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000409 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000410 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000411 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000412 PyErr_BadInternalCall();
413 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000414 }
415
416 /* Resizing unicode_empty and single character objects is not
417 possible since these are being shared. We simply return a fresh
418 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000419 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 (v == unicode_empty || v->length == 1)) {
421 PyUnicodeObject *w = _PyUnicode_New(length);
422 if (w == NULL)
423 return -1;
424 Py_UNICODE_COPY(w->str, v->str,
425 length < v->length ? length : v->length);
426 Py_DECREF(*unicode);
427 *unicode = w;
428 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 }
430
431 /* Note that we don't have to modify *unicode for unshared Unicode
432 objects, since we can modify them in-place. */
433 return unicode_resize(v, length);
434}
435
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000436int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
437{
438 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
439}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000442 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443{
444 PyUnicodeObject *unicode;
445
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446 /* If the Unicode data is known at construction time, we can apply
447 some optimizations which share commonly used objects. */
448 if (u != NULL) {
449
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000450 /* Optimization for empty strings */
451 if (size == 0 && unicode_empty != NULL) {
452 Py_INCREF(unicode_empty);
453 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000454 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455
456 /* Single character Unicode objects in the Latin-1 range are
457 shared when using this constructor */
458 if (size == 1 && *u < 256) {
459 unicode = unicode_latin1[*u];
460 if (!unicode) {
461 unicode = _PyUnicode_New(1);
462 if (!unicode)
463 return NULL;
464 unicode->str[0] = *u;
465 unicode_latin1[*u] = unicode;
466 }
467 Py_INCREF(unicode);
468 return (PyObject *)unicode;
469 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000470 }
Tim Petersced69f82003-09-16 20:30:58 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 unicode = _PyUnicode_New(size);
473 if (!unicode)
474 return NULL;
475
476 /* Copy the Unicode data into the new object */
477 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000478 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479
480 return (PyObject *)unicode;
481}
482
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000483PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
484{
485 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000486
Benjamin Peterson857ce152009-01-31 16:29:18 +0000487 if (size < 0) {
488 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000490 return NULL;
491 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000492
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000493 /* If the Unicode data is known at construction time, we can apply
494 some optimizations which share commonly used objects.
495 Also, this means the input must be UTF-8, so fall back to the
496 UTF-8 decoder at the end. */
497 if (u != NULL) {
498
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000499 /* Optimization for empty strings */
500 if (size == 0 && unicode_empty != NULL) {
501 Py_INCREF(unicode_empty);
502 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000503 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000504
505 /* Single characters are shared when using this constructor.
506 Restrict to ASCII, since the input must be UTF-8. */
507 if (size == 1 && Py_CHARMASK(*u) < 128) {
508 unicode = unicode_latin1[Py_CHARMASK(*u)];
509 if (!unicode) {
510 unicode = _PyUnicode_New(1);
511 if (!unicode)
512 return NULL;
513 unicode->str[0] = Py_CHARMASK(*u);
514 unicode_latin1[Py_CHARMASK(*u)] = unicode;
515 }
516 Py_INCREF(unicode);
517 return (PyObject *)unicode;
518 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000519
520 return PyUnicode_DecodeUTF8(u, size, NULL);
521 }
522
523 unicode = _PyUnicode_New(size);
524 if (!unicode)
525 return NULL;
526
527 return (PyObject *)unicode;
528}
529
530PyObject *PyUnicode_FromString(const char *u)
531{
532 size_t size = strlen(u);
533 if (size > PY_SSIZE_T_MAX) {
534 PyErr_SetString(PyExc_OverflowError, "input too long");
535 return NULL;
536 }
537
538 return PyUnicode_FromStringAndSize(u, size);
539}
540
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541#ifdef HAVE_WCHAR_H
542
Mark Dickinson6b265f12009-03-18 16:07:26 +0000543#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544# define CONVERT_WCHAR_TO_SURROGATES
545#endif
546
547#ifdef CONVERT_WCHAR_TO_SURROGATES
548
549/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550 to convert from UTF32 to UTF16. */
551
552PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
553 Py_ssize_t size)
554{
555 PyUnicodeObject *unicode;
556 register Py_ssize_t i;
557 Py_ssize_t alloc;
558 const wchar_t *orig_w;
559
560 if (w == NULL) {
561 PyErr_BadInternalCall();
562 return NULL;
563 }
564
565 alloc = size;
566 orig_w = w;
567 for (i = size; i > 0; i--) {
568 if (*w > 0xFFFF)
569 alloc++;
570 w++;
571 }
572 w = orig_w;
573 unicode = _PyUnicode_New(alloc);
574 if (!unicode)
575 return NULL;
576
577 /* Copy the wchar_t data into the new object */
578 {
579 register Py_UNICODE *u;
580 u = PyUnicode_AS_UNICODE(unicode);
581 for (i = size; i > 0; i--) {
582 if (*w > 0xFFFF) {
583 wchar_t ordinal = *w++;
584 ordinal -= 0x10000;
585 *u++ = 0xD800 | (ordinal >> 10);
586 *u++ = 0xDC00 | (ordinal & 0x3FF);
587 }
588 else
589 *u++ = *w++;
590 }
591 }
592 return (PyObject *)unicode;
593}
594
595#else
596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000598 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599{
600 PyUnicodeObject *unicode;
601
602 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000603 PyErr_BadInternalCall();
604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 }
606
607 unicode = _PyUnicode_New(size);
608 if (!unicode)
609 return NULL;
610
611 /* Copy the wchar_t data into the new object */
612#ifdef HAVE_USABLE_WCHAR_T
613 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000614#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000615 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000616 register Py_UNICODE *u;
617 register Py_ssize_t i;
618 u = PyUnicode_AS_UNICODE(unicode);
619 for (i = size; i > 0; i--)
620 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 }
622#endif
623
624 return (PyObject *)unicode;
625}
626
Mark Dickinson6b265f12009-03-18 16:07:26 +0000627#endif /* CONVERT_WCHAR_TO_SURROGATES */
628
629#undef CONVERT_WCHAR_TO_SURROGATES
630
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000631static void
632makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
633{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000634 *fmt++ = '%';
635 if (width) {
636 if (zeropad)
637 *fmt++ = '0';
638 fmt += sprintf(fmt, "%d", width);
639 }
640 if (precision)
641 fmt += sprintf(fmt, ".%d", precision);
642 if (longflag)
643 *fmt++ = 'l';
644 else if (size_tflag) {
645 char *f = PY_FORMAT_SIZE_T;
646 while (*f)
647 *fmt++ = *f++;
648 }
649 *fmt++ = c;
650 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000651}
652
653#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
654
655PyObject *
656PyUnicode_FromFormatV(const char *format, va_list vargs)
657{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000658 va_list count;
659 Py_ssize_t callcount = 0;
660 PyObject **callresults = NULL;
661 PyObject **callresult = NULL;
662 Py_ssize_t n = 0;
663 int width = 0;
664 int precision = 0;
665 int zeropad;
666 const char* f;
667 Py_UNICODE *s;
668 PyObject *string;
669 /* used by sprintf */
670 char buffer[21];
671 /* use abuffer instead of buffer, if we need more space
672 * (which can happen if there's a format specifier with width). */
673 char *abuffer = NULL;
674 char *realbuffer;
675 Py_ssize_t abuffersize = 0;
676 char fmt[60]; /* should be enough for %0width.precisionld */
677 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000678
679#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000680 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000681#else
682#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000683 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000684#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#endif
687#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000688 /* step 1: count the number of %S/%R/%s format specifications
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000691 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000692 if (*f == '%') {
693 if (*(f+1)=='%')
694 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000695 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000696 ++callcount;
697 while (isdigit((unsigned)*f))
698 width = (width*10) + *f++ - '0';
699 while (*++f && *f != '%' && !isalpha((unsigned)*f))
700 ;
701 if (*f == 's')
702 ++callcount;
703 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000704 }
705 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000707 if (callcount) {
708 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
709 if (!callresults) {
710 PyErr_NoMemory();
711 return NULL;
712 }
713 callresult = callresults;
714 }
715 /* step 3: figure out how large a buffer we need */
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f;
719 width = 0;
720 while (isdigit((unsigned)*f))
721 width = (width*10) + *f++ - '0';
722 while (*++f && *f != '%' && !isalpha((unsigned)*f))
723 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000724
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726 * they don't affect the amount of space we reserve.
727 */
728 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000729 (f[1] == 'd' || f[1] == 'u'))
730 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731
Benjamin Peterson857ce152009-01-31 16:29:18 +0000732 switch (*f) {
733 case 'c':
734 (void)va_arg(count, int);
735 /* fall through... */
736 case '%':
737 n++;
738 break;
739 case 'd': case 'u': case 'i': case 'x':
740 (void) va_arg(count, int);
741 /* 20 bytes is enough to hold a 64-bit
742 integer. Decimal takes the most space.
743 This isn't enough for octal.
744 If a width is specified we need more
745 (which we allocate later). */
746 if (width < 20)
747 width = 20;
748 n += width;
749 if (abuffersize < width)
750 abuffersize = width;
751 break;
752 case 's':
753 {
754 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000755 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000756 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
757 if (!str)
758 goto fail;
759 n += PyUnicode_GET_SIZE(str);
760 /* Remember the str and switch to the next slot */
761 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000762 break;
763 }
764 case 'U':
765 {
766 PyObject *obj = va_arg(count, PyObject *);
767 assert(obj && PyUnicode_Check(obj));
768 n += PyUnicode_GET_SIZE(obj);
769 break;
770 }
771 case 'V':
772 {
773 PyObject *obj = va_arg(count, PyObject *);
774 const char *str = va_arg(count, const char *);
775 assert(obj || str);
776 assert(!obj || PyUnicode_Check(obj));
777 if (obj)
778 n += PyUnicode_GET_SIZE(obj);
779 else
780 n += strlen(str);
781 break;
782 }
783 case 'S':
784 {
785 PyObject *obj = va_arg(count, PyObject *);
786 PyObject *str;
787 assert(obj);
788 str = PyObject_Str(obj);
789 if (!str)
790 goto fail;
791 n += PyUnicode_GET_SIZE(str);
792 /* Remember the str and switch to the next slot */
793 *callresult++ = str;
794 break;
795 }
796 case 'R':
797 {
798 PyObject *obj = va_arg(count, PyObject *);
799 PyObject *repr;
800 assert(obj);
801 repr = PyObject_Repr(obj);
802 if (!repr)
803 goto fail;
804 n += PyUnicode_GET_SIZE(repr);
805 /* Remember the repr and switch to the next slot */
806 *callresult++ = repr;
807 break;
808 }
809 case 'p':
810 (void) va_arg(count, int);
811 /* maximum 64-bit pointer representation:
812 * 0xffffffffffffffff
813 * so 19 characters is enough.
814 * XXX I count 18 -- what's the extra for?
815 */
816 n += 19;
817 break;
818 default:
819 /* if we stumble upon an unknown
820 formatting code, copy the rest of
821 the format string to the output
822 string. (we cannot just skip the
823 code, since there's no way to know
824 what's in the argument list) */
825 n += strlen(p);
826 goto expand;
827 }
828 } else
829 n++;
830 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000831 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000832 if (abuffersize > 20) {
833 abuffer = PyObject_Malloc(abuffersize);
834 if (!abuffer) {
835 PyErr_NoMemory();
836 goto fail;
837 }
838 realbuffer = abuffer;
839 }
840 else
841 realbuffer = buffer;
842 /* step 4: fill the buffer */
843 /* Since we've analyzed how much space we need for the worst case,
844 we don't have to resize the string.
845 There can be no errors beyond this point. */
846 string = PyUnicode_FromUnicode(NULL, n);
847 if (!string)
848 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000849
Benjamin Peterson857ce152009-01-31 16:29:18 +0000850 s = PyUnicode_AS_UNICODE(string);
851 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000852
Benjamin Peterson857ce152009-01-31 16:29:18 +0000853 for (f = format; *f; f++) {
854 if (*f == '%') {
855 const char* p = f++;
856 int longflag = 0;
857 int size_tflag = 0;
858 zeropad = (*f == '0');
859 /* parse the width.precision part */
860 width = 0;
861 while (isdigit((unsigned)*f))
862 width = (width*10) + *f++ - '0';
863 precision = 0;
864 if (*f == '.') {
865 f++;
866 while (isdigit((unsigned)*f))
867 precision = (precision*10) + *f++ - '0';
868 }
869 /* handle the long flag, but only for %ld and %lu.
870 others can be added when necessary. */
871 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
872 longflag = 1;
873 ++f;
874 }
875 /* handle the size_t flag. */
876 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
877 size_tflag = 1;
878 ++f;
879 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000880
Benjamin Peterson857ce152009-01-31 16:29:18 +0000881 switch (*f) {
882 case 'c':
883 *s++ = va_arg(vargs, int);
884 break;
885 case 'd':
886 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
887 if (longflag)
888 sprintf(realbuffer, fmt, va_arg(vargs, long));
889 else if (size_tflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
891 else
892 sprintf(realbuffer, fmt, va_arg(vargs, int));
893 appendstring(realbuffer);
894 break;
895 case 'u':
896 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
897 if (longflag)
898 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
899 else if (size_tflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
901 else
902 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
903 appendstring(realbuffer);
904 break;
905 case 'i':
906 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
909 break;
910 case 'x':
911 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
912 sprintf(realbuffer, fmt, va_arg(vargs, int));
913 appendstring(realbuffer);
914 break;
915 case 's':
916 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000917 /* unused, since we already have the result */
918 (void) va_arg(vargs, char *);
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
920 PyUnicode_GET_SIZE(*callresult));
921 s += PyUnicode_GET_SIZE(*callresult);
922 /* We're done with the unicode()/repr() => forget it */
923 Py_DECREF(*callresult);
924 /* switch to next unicode()/repr() result */
925 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000926 break;
927 }
928 case 'U':
929 {
930 PyObject *obj = va_arg(vargs, PyObject *);
931 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933 s += size;
934 break;
935 }
936 case 'V':
937 {
938 PyObject *obj = va_arg(vargs, PyObject *);
939 const char *str = va_arg(vargs, const char *);
940 if (obj) {
941 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
942 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
943 s += size;
944 } else {
945 appendstring(str);
946 }
947 break;
948 }
949 case 'S':
950 case 'R':
951 {
952 Py_UNICODE *ucopy;
953 Py_ssize_t usize;
954 Py_ssize_t upos;
955 /* unused, since we already have the result */
956 (void) va_arg(vargs, PyObject *);
957 ucopy = PyUnicode_AS_UNICODE(*callresult);
958 usize = PyUnicode_GET_SIZE(*callresult);
959 for (upos = 0; upos<usize;)
960 *s++ = ucopy[upos++];
961 /* We're done with the unicode()/repr() => forget it */
962 Py_DECREF(*callresult);
963 /* switch to next unicode()/repr() result */
964 ++callresult;
965 break;
966 }
967 case 'p':
968 sprintf(buffer, "%p", va_arg(vargs, void*));
969 /* %p is ill-defined: ensure leading 0x. */
970 if (buffer[1] == 'X')
971 buffer[1] = 'x';
972 else if (buffer[1] != 'x') {
973 memmove(buffer+2, buffer, strlen(buffer)+1);
974 buffer[0] = '0';
975 buffer[1] = 'x';
976 }
977 appendstring(buffer);
978 break;
979 case '%':
980 *s++ = '%';
981 break;
982 default:
983 appendstring(p);
984 goto end;
985 }
986 } else
987 *s++ = *f;
988 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000989
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000990 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000991 if (callresults)
992 PyObject_Free(callresults);
993 if (abuffer)
994 PyObject_Free(abuffer);
995 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
996 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000997 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000998 if (callresults) {
999 PyObject **callresult2 = callresults;
1000 while (callresult2 < callresult) {
1001 Py_DECREF(*callresult2);
1002 ++callresult2;
1003 }
1004 PyObject_Free(callresults);
1005 }
1006 if (abuffer)
1007 PyObject_Free(abuffer);
1008 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001009}
1010
1011#undef appendstring
1012
1013PyObject *
1014PyUnicode_FromFormat(const char *format, ...)
1015{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001016 PyObject* ret;
1017 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001018
1019#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001020 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001021#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 ret = PyUnicode_FromFormatV(format, vargs);
1025 va_end(vargs);
1026 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001027}
1028
Martin v. Löwis18e16552006-02-15 17:27:45 +00001029Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001030 wchar_t *w,
1031 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032{
1033 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001034 PyErr_BadInternalCall();
1035 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001037
1038 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001040 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001041
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042#ifdef HAVE_USABLE_WCHAR_T
1043 memcpy(w, unicode->str, size * sizeof(wchar_t));
1044#else
1045 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001046 register Py_UNICODE *u;
1047 register Py_ssize_t i;
1048 u = PyUnicode_AS_UNICODE(unicode);
1049 for (i = size; i > 0; i--)
1050 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051 }
1052#endif
1053
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001054 if (size > PyUnicode_GET_SIZE(unicode))
1055 return PyUnicode_GET_SIZE(unicode);
1056 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001057 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058}
1059
1060#endif
1061
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062PyObject *PyUnicode_FromOrdinal(int ordinal)
1063{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001064 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001065
1066#ifdef Py_UNICODE_WIDE
1067 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001068 PyErr_SetString(PyExc_ValueError,
1069 "unichr() arg not in range(0x110000) "
1070 "(wide Python build)");
1071 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001072 }
1073#else
1074 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001075 PyErr_SetString(PyExc_ValueError,
1076 "unichr() arg not in range(0x10000) "
1077 "(narrow Python build)");
1078 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001079 }
1080#endif
1081
Hye-Shik Chang40574832004-04-06 07:24:51 +00001082 s[0] = (Py_UNICODE)ordinal;
1083 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001084}
1085
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086PyObject *PyUnicode_FromObject(register PyObject *obj)
1087{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001089 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 Py_INCREF(obj);
1092 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001093 }
1094 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001095 /* For a Unicode subtype that's not a Unicode object,
1096 return a true Unicode object with the same data. */
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101}
1102
1103PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001104 const char *encoding,
1105 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001106{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001107 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001108 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001112 PyErr_BadInternalCall();
1113 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001116#if 0
1117 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001118 that no encodings is given and then redirect to
1119 PyObject_Unicode() which then applies the additional logic for
1120 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001121
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001122 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001123 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124
1125 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001126 if (PyUnicode_Check(obj)) {
1127 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001128 PyErr_SetString(PyExc_TypeError,
1129 "decoding Unicode is not supported");
1130 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001131 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 return PyObject_Unicode(obj);
1133 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001134#else
1135 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001136 PyErr_SetString(PyExc_TypeError,
1137 "decoding Unicode is not supported");
1138 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001139 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001140#endif
1141
1142 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001143 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001144 s = PyString_AS_STRING(obj);
1145 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001146 }
Christian Heimes3497f942008-05-26 12:29:14 +00001147 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 /* Python 2.x specific */
1149 PyErr_Format(PyExc_TypeError,
1150 "decoding bytearray is not supported");
1151 return NULL;
1152 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001153 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001154 /* Overwrite the error message with something more useful in
1155 case of a TypeError. */
1156 if (PyErr_ExceptionMatches(PyExc_TypeError))
1157 PyErr_Format(PyExc_TypeError,
1158 "coercing to Unicode: need string or buffer, "
1159 "%.80s found",
1160 Py_TYPE(obj)->tp_name);
1161 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001162 }
Tim Petersced69f82003-09-16 20:30:58 +00001163
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001164 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001166 Py_INCREF(unicode_empty);
1167 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Tim Petersced69f82003-09-16 20:30:58 +00001169 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001170 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001171
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001172 return v;
1173
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001174 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176}
1177
1178PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001179 Py_ssize_t size,
1180 const char *encoding,
1181 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182{
1183 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001184
1185 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001186 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001187
1188 /* Shortcuts for common default encodings */
1189 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001191 else if (strcmp(encoding, "latin-1") == 0)
1192 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194 else if (strcmp(encoding, "mbcs") == 0)
1195 return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001197 else if (strcmp(encoding, "ascii") == 0)
1198 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199
1200 /* Decode via the codec registry */
1201 buffer = PyBuffer_FromMemory((void *)s, size);
1202 if (buffer == NULL)
1203 goto onError;
1204 unicode = PyCodec_Decode(buffer, encoding, errors);
1205 if (unicode == NULL)
1206 goto onError;
1207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001209 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001210 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 Py_DECREF(unicode);
1212 goto onError;
1213 }
1214 Py_DECREF(buffer);
1215 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001216
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001217 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 Py_XDECREF(buffer);
1219 return NULL;
1220}
1221
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001222PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223 const char *encoding,
1224 const char *errors)
1225{
1226 PyObject *v;
1227
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1231 }
1232
1233 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001234 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001235
1236 /* Decode via the codec registry */
1237 v = PyCodec_Decode(unicode, encoding, errors);
1238 if (v == NULL)
1239 goto onError;
1240 return v;
1241
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001242 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001243 return NULL;
1244}
1245
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001247 Py_ssize_t size,
1248 const char *encoding,
1249 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250{
1251 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001252
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 unicode = PyUnicode_FromUnicode(s, size);
1254 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257 Py_DECREF(unicode);
1258 return v;
1259}
1260
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001261PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262 const char *encoding,
1263 const char *errors)
1264{
1265 PyObject *v;
1266
1267 if (!PyUnicode_Check(unicode)) {
1268 PyErr_BadArgument();
1269 goto onError;
1270 }
1271
1272 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001273 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001274
1275 /* Encode via the codec registry */
1276 v = PyCodec_Encode(unicode, encoding, errors);
1277 if (v == NULL)
1278 goto onError;
1279 return v;
1280
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001281 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001282 return NULL;
1283}
1284
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286 const char *encoding,
1287 const char *errors)
1288{
1289 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 if (!PyUnicode_Check(unicode)) {
1292 PyErr_BadArgument();
1293 goto onError;
1294 }
Fred Drakee4315f52000-05-09 19:53:39 +00001295
Tim Petersced69f82003-09-16 20:30:58 +00001296 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001297 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001298
1299 /* Shortcuts for common default encodings */
1300 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001301 if (strcmp(encoding, "utf-8") == 0)
1302 return PyUnicode_AsUTF8String(unicode);
1303 else if (strcmp(encoding, "latin-1") == 0)
1304 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001305#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001306 else if (strcmp(encoding, "mbcs") == 0)
1307 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001308#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001309 else if (strcmp(encoding, "ascii") == 0)
1310 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001317 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001319 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001320 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 Py_DECREF(v);
1322 goto onError;
1323 }
1324 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001325
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001326 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 return NULL;
1328}
1329
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001330PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001331 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332{
1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334
1335 if (v)
1336 return v;
1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338 if (v && errors == NULL)
1339 ((PyUnicodeObject *)unicode)->defenc = v;
1340 return v;
1341}
1342
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344{
1345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 goto onError;
1348 }
1349 return PyUnicode_AS_UNICODE(unicode);
1350
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001351 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 return NULL;
1353}
1354
Martin v. Löwis18e16552006-02-15 17:27:45 +00001355Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001356{
1357 if (!PyUnicode_Check(unicode)) {
1358 PyErr_BadArgument();
1359 goto onError;
1360 }
1361 return PyUnicode_GET_SIZE(unicode);
1362
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001363 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364 return -1;
1365}
1366
Thomas Wouters78890102000-07-22 19:25:51 +00001367const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001368{
1369 return unicode_default_encoding;
1370}
1371
1372int PyUnicode_SetDefaultEncoding(const char *encoding)
1373{
1374 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001375
Fred Drakee4315f52000-05-09 19:53:39 +00001376 /* Make sure the encoding is valid. As side effect, this also
1377 loads the encoding into the codec registry cache. */
1378 v = _PyCodec_Lookup(encoding);
1379 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001380 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001381 Py_DECREF(v);
1382 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001383 encoding,
1384 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001385 return 0;
1386
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001387 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001388 return -1;
1389}
1390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001391/* error handling callback helper:
1392 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001393 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001394 and adjust various state variables.
1395 return 0 on success, -1 on error
1396*/
1397
1398static
1399int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001400 const char *encoding, const char *reason,
1401 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001404{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001405 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406
1407 PyObject *restuple = NULL;
1408 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410 Py_ssize_t requiredsize;
1411 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 int res = -1;
1415
1416 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001417 *errorHandler = PyCodec_LookupError(errors);
1418 if (*errorHandler == NULL)
1419 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 }
1421
1422 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001423 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001424 encoding, input, insize, *startinpos, *endinpos, reason);
1425 if (*exceptionObject == NULL)
1426 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001427 }
1428 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430 goto onError;
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 }
1436
1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001439 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001440 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001441 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001442 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 }
1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001448 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001451 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452
1453 /* need more space? (at least enough for what we
1454 have+the replacement+the rest of the string (starting
1455 at the new input position), so we won't have to check space
1456 when there are no errors in the rest of the string) */
1457 repptr = PyUnicode_AS_UNICODE(repunicode);
1458 repsize = PyUnicode_GET_SIZE(repunicode);
1459 requiredsize = *outpos + repsize + insize-newpos;
1460 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001461 if (requiredsize<2*outsize)
1462 requiredsize = 2*outsize;
1463 if (_PyUnicode_Resize(output, requiredsize) < 0)
1464 goto onError;
1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466 }
1467 *endinpos = newpos;
1468 *inptr = input + newpos;
1469 Py_UNICODE_COPY(*outptr, repptr, repsize);
1470 *outptr += repsize;
1471 *outpos += repsize;
1472 /* we made it! */
1473 res = 0;
1474
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001475 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001476 Py_XDECREF(restuple);
1477 return res;
1478}
1479
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480/* --- UTF-7 Codec -------------------------------------------------------- */
1481
Antoine Pitrou653dece2009-05-04 18:32:32 +00001482/* See RFC2152 for details. We encode conservatively and decode liberally. */
1483
1484/* Three simple macros defining base-64. */
1485
/* Is c a base-64 character?
 *
 * The ASCII ranges are tested explicitly rather than with isalnum():
 * isalnum() depends on the current locale, and its behaviour is
 * undefined for arguments that are neither EOF nor representable as
 * unsigned char, so non-ASCII input could be misclassified.
 * c is evaluated more than once and must be free of side effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
1490
/* Map a base-64 character c to its 6-bit value (0..63).  c must
   already be known to be a base-64 character: every value that is not
   a letter, a digit or '+' is mapped to 63, the value of '/'. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     63)
1498
/* Base-64 character for the bottom 6 bits of n; higher bits of n are
   masked off before the alphabet lookup. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])
1503
/* DECODE_DIRECT: true when a byte met in a UTF-7 string stands for
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
1511
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is pure lookup data and is never modified, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1545
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         the character; only values 1..127 can ever be direct
 *   directO   nonzero if Set O characters (category 1) are written as-is
 *   directWS  nonzero if whitespace (category 2) is written as-is
 *
 * Set D characters (category 0) are always direct.  The range test comes
 * first so utf7_category is never indexed out of bounds.  Every parameter
 * is parenthesized so that compound argument expressions group correctly. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001559 Py_ssize_t size,
1560 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001562 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563}
1564
Antoine Pitrou653dece2009-05-04 18:32:32 +00001565/* The decoder. The only state we preserve is our read position,
1566 * i.e. how many characters we have consumed. So if we end in the
1567 * middle of a shift sequence we have to back off the read position
1568 * and the output to the beginning of the sequence, otherwise we lose
1569 * all the shift state (seen bits, number of bits seen, high
1570 * surrogate). */
1571
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001572PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001573 Py_ssize_t size,
1574 const char *errors,
1575 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001576{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001577 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t startinpos;
1579 Py_ssize_t endinpos;
1580 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581 const char *e;
1582 PyUnicodeObject *unicode;
1583 Py_UNICODE *p;
1584 const char *errmsg = "";
1585 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001586 Py_UNICODE *shiftOutStart;
1587 unsigned int base64bits = 0;
1588 unsigned long base64buffer = 0;
1589 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001590 PyObject *errorHandler = NULL;
1591 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592
1593 unicode = _PyUnicode_New(size);
1594 if (!unicode)
1595 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001596 if (size == 0) {
1597 if (consumed)
1598 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001599 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001600 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601
1602 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001603 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 e = s + size;
1605
1606 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001607 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 if (inShift) { /* in a base-64 section */
1610 if (IS_BASE64(ch)) { /* consume a base-64 character */
1611 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612 base64bits += 6;
1613 s++;
1614 if (base64bits >= 16) {
1615 /* we have enough bits for a UTF-16 value */
1616 Py_UNICODE outCh = (Py_UNICODE)
1617 (base64buffer >> (base64bits-16));
1618 base64bits -= 16;
1619 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620 if (surrogate) {
1621 /* expecting a second surrogate */
1622 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623#ifdef Py_UNICODE_WIDE
1624 *p++ = (((surrogate & 0x3FF)<<10)
1625 | (outCh & 0x3FF)) + 0x10000;
1626#else
1627 *p++ = surrogate;
1628 *p++ = outCh;
1629#endif
1630 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001631 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001632 }
1633 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001634 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001635 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001636 }
1637 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001638 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001639 /* first surrogate */
1640 surrogate = outCh;
1641 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001642 else {
1643 *p++ = outCh;
1644 }
1645 }
1646 }
1647 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648 inShift = 0;
1649 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001650 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001651 *p++ = surrogate;
1652 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001654 if (base64bits > 0) { /* left-over bits */
1655 if (base64bits >= 6) {
1656 /* We've seen at least one base-64 character */
1657 errmsg = "partial character in shift sequence";
1658 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 else {
1661 /* Some bits remain; they should be zero */
1662 if (base64buffer != 0) {
1663 errmsg = "non-zero padding bits in shift sequence";
1664 goto utf7Error;
1665 }
1666 }
1667 }
1668 if (ch != '-') {
1669 /* '-' is absorbed; other terminating
1670 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001671 *p++ = ch;
1672 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673 }
1674 }
1675 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001676 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001677 s++; /* consume '+' */
1678 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 s++;
1680 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001681 }
1682 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001683 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001684 shiftOutStart = p;
1685 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001686 }
1687 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001688 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 *p++ = ch;
1690 s++;
1691 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001692 else {
1693 startinpos = s-starts;
1694 s++;
1695 errmsg = "unexpected special character";
1696 goto utf7Error;
1697 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001698 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001699utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 outpos = p-PyUnicode_AS_UNICODE(unicode);
1701 endinpos = s-starts;
1702 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001703 errors, &errorHandler,
1704 "utf7", errmsg,
1705 starts, size, &startinpos, &endinpos, &exc, &s,
1706 &unicode, &outpos, &p))
1707 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 }
1709
Antoine Pitrou653dece2009-05-04 18:32:32 +00001710 /* end of string */
1711
1712 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1713 /* if we're in an inconsistent state, that's an error */
1714 if (surrogate ||
1715 (base64bits >= 6) ||
1716 (base64bits > 0 && base64buffer != 0)) {
1717 outpos = p-PyUnicode_AS_UNICODE(unicode);
1718 endinpos = size;
1719 if (unicode_decode_call_errorhandler(
1720 errors, &errorHandler,
1721 "utf7", "unterminated shift sequence",
1722 starts, size, &startinpos, &endinpos, &exc, &s,
1723 &unicode, &outpos, &p))
1724 goto onError;
1725 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001726 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001727
1728 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001729 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001730 if (inShift) {
1731 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001732 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733 }
1734 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001737 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001739 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 goto onError;
1741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 Py_XDECREF(errorHandler);
1743 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744 return (PyObject *)unicode;
1745
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001746 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001747 Py_XDECREF(errorHandler);
1748 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 Py_DECREF(unicode);
1750 return NULL;
1751}
1752
1753
1754PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001755 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001756 int base64SetO,
1757 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001758 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759{
1760 PyObject *v;
1761 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001762 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001763 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001764 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001765 unsigned int base64bits = 0;
1766 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 char * out;
1768 char * start;
1769
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001770 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001771 return PyErr_NoMemory();
1772
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001774 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001775
Antoine Pitrou653dece2009-05-04 18:32:32 +00001776 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001777 if (v == NULL)
1778 return NULL;
1779
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001780 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781 for (;i < size; ++i) {
1782 Py_UNICODE ch = s[i];
1783
Antoine Pitrou653dece2009-05-04 18:32:32 +00001784 if (inShift) {
1785 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1786 /* shifting out */
1787 if (base64bits) { /* output remaining bits */
1788 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1789 base64buffer = 0;
1790 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001791 }
1792 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001793 /* Characters not in the BASE64 set implicitly unshift the sequence
1794 so no '-' is required, except if the character is itself a '-' */
1795 if (IS_BASE64(ch) || ch == '-') {
1796 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001798 *out++ = (char) ch;
1799 }
1800 else {
1801 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001802 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 else { /* not in a shift sequence */
1805 if (ch == '+') {
1806 *out++ = '+';
1807 *out++ = '-';
1808 }
1809 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1810 *out++ = (char) ch;
1811 }
1812 else {
1813 *out++ = '+';
1814 inShift = 1;
1815 goto encode_char;
1816 }
1817 }
1818 continue;
1819encode_char:
1820#ifdef Py_UNICODE_WIDE
1821 if (ch >= 0x10000) {
1822 /* code first surrogate */
1823 base64bits += 16;
1824 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1825 while (base64bits >= 6) {
1826 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1827 base64bits -= 6;
1828 }
1829 /* prepare second surrogate */
1830 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1831 }
1832#endif
1833 base64bits += 16;
1834 base64buffer = (base64buffer << 16) | ch;
1835 while (base64bits >= 6) {
1836 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1837 base64bits -= 6;
1838 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001839 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001840 if (base64bits)
1841 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1842 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001843 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001845 if (_PyString_Resize(&v, out - start))
1846 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847 return v;
1848}
1849
/* The UTF-7 helper macros are private to the codec above; drop them
   so they do not leak into the rest of the file. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001855
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856/* --- UTF-8 Codec -------------------------------------------------------- */
1857
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is read-only
   lookup data, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1879
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001881 Py_ssize_t size,
1882 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883{
Walter Dörwald69652032004-09-07 20:24:22 +00001884 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1885}
1886
1887PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001888 Py_ssize_t size,
1889 const char *errors,
1890 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001891{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001892 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001893 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001894 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001895 Py_ssize_t startinpos;
1896 Py_ssize_t endinpos;
1897 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 const char *e;
1899 PyUnicodeObject *unicode;
1900 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001901 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 PyObject *errorHandler = NULL;
1903 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904
1905 /* Note: size will always be longer than the resulting Unicode
1906 character count */
1907 unicode = _PyUnicode_New(size);
1908 if (!unicode)
1909 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001910 if (size == 0) {
1911 if (consumed)
1912 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915
1916 /* Unpack UTF-8 encoded data */
1917 p = unicode->str;
1918 e = s + size;
1919
1920 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001921 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922
1923 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001924 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 s++;
1926 continue;
1927 }
1928
1929 n = utf8_code_length[ch];
1930
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001931 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001932 if (consumed)
1933 break;
1934 else {
1935 errmsg = "unexpected end of data";
1936 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001937 endinpos = startinpos+1;
1938 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1939 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001940 goto utf8Error;
1941 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943
1944 switch (n) {
1945
1946 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001947 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001948 startinpos = s-starts;
1949 endinpos = startinpos+1;
1950 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951
1952 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001953 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001960 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001961 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001962 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001963 goto utf8Error;
1964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001966 assert ((ch > 0x007F) && (ch <= 0x07FF));
1967 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 break;
1969
1970 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001971 /* XXX: surrogates shouldn't be valid UTF-8!
1972 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1973 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1974 Uncomment the 2 lines below to make them invalid,
1975 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001976 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00001977 (s[2] & 0xc0) != 0x80 ||
1978 ((unsigned char)s[0] == 0xE0 &&
1979 (unsigned char)s[1] < 0xA0)/* ||
1980 ((unsigned char)s[0] == 0xED &&
1981 (unsigned char)s[1] > 0x9F)*/) {
1982 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001983 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001984 endinpos = startinpos + 1;
1985
1986 /* if s[1] first two bits are 1 and 0, then the invalid
1987 continuation byte is s[2], so increment endinpos by 1,
1988 if not, s[1] is invalid and endinpos doesn't need to
1989 be incremented. */
1990 if ((s[1] & 0xC0) == 0x80)
1991 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001992 goto utf8Error;
1993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001995 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
1996 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001997 break;
1998
1999 case 4:
2000 if ((s[1] & 0xc0) != 0x80 ||
2001 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002002 (s[3] & 0xc0) != 0x80 ||
2003 ((unsigned char)s[0] == 0xF0 &&
2004 (unsigned char)s[1] < 0x90) ||
2005 ((unsigned char)s[0] == 0xF4 &&
2006 (unsigned char)s[1] > 0x8F)) {
2007 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002008 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002009 endinpos = startinpos + 1;
2010 if ((s[1] & 0xC0) == 0x80) {
2011 endinpos++;
2012 if ((s[2] & 0xC0) == 0x80)
2013 endinpos++;
2014 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002015 goto utf8Error;
2016 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002018 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2019 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2020
Fredrik Lundh8f455852001-06-27 18:59:43 +00002021#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002022 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002023#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002024 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002025
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002026 /* translate from 10000..10FFFF to 0..FFFF */
2027 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002028
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002029 /* high surrogate = top 10 bits added to D800 */
2030 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002033 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002034#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 }
2037 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002038 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002040 utf8Error:
2041 outpos = p-PyUnicode_AS_UNICODE(unicode);
2042 if (unicode_decode_call_errorhandler(
2043 errors, &errorHandler,
2044 "utf8", errmsg,
2045 starts, size, &startinpos, &endinpos, &exc, &s,
2046 &unicode, &outpos, &p))
2047 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald69652032004-09-07 20:24:22 +00002049 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002050 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
2052 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002053 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 goto onError;
2055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 Py_XDECREF(errorHandler);
2057 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 return (PyObject *)unicode;
2059
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002060 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 Py_XDECREF(errorHandler);
2062 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 Py_DECREF(unicode);
2064 return NULL;
2065}
2066
Tim Peters602f7402002-04-27 18:03:26 +00002067/* Allocation strategy: if the string is short, convert into a stack buffer
2068 and allocate exactly as much space needed at the end. Else allocate the
2069 maximum possible needed (4 result bytes per Unicode character), and return
2070 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002071*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002072PyObject *
2073PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002074 Py_ssize_t size,
2075 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076{
Tim Peters602f7402002-04-27 18:03:26 +00002077#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002078
Martin v. Löwis18e16552006-02-15 17:27:45 +00002079 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002080 PyObject *v; /* result string object */
2081 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002082 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002083 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002084 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002085
Tim Peters602f7402002-04-27 18:03:26 +00002086 assert(s != NULL);
2087 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088
Tim Peters602f7402002-04-27 18:03:26 +00002089 if (size <= MAX_SHORT_UNICHARS) {
2090 /* Write into the stack buffer; nallocated can't overflow.
2091 * At the end, we'll allocate exactly as much heap space as it
2092 * turns out we need.
2093 */
2094 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2095 v = NULL; /* will allocate after we're done */
2096 p = stackbuf;
2097 }
2098 else {
2099 /* Overallocate on the heap, and give the excess back at the end. */
2100 nallocated = size * 4;
2101 if (nallocated / 4 != size) /* overflow! */
2102 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002103 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002104 if (v == NULL)
2105 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002106 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002107 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002108
Tim Peters602f7402002-04-27 18:03:26 +00002109 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002110 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002111
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002112 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002113 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002117 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002118 *p++ = (char)(0xc0 | (ch >> 6));
2119 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002120 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121 else {
Tim Peters602f7402002-04-27 18:03:26 +00002122 /* Encode UCS2 Unicode ordinals */
2123 if (ch < 0x10000) {
2124 /* Special case: check for high surrogate */
2125 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2126 Py_UCS4 ch2 = s[i];
2127 /* Check for low surrogate and combine the two to
2128 form a UCS4 value */
2129 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002130 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002131 i++;
2132 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002133 }
Tim Peters602f7402002-04-27 18:03:26 +00002134 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002135 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002136 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002137 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2138 *p++ = (char)(0x80 | (ch & 0x3f));
2139 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002140 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002141 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002142 /* Encode UCS4 Unicode ordinals */
2143 *p++ = (char)(0xf0 | (ch >> 18));
2144 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2145 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2146 *p++ = (char)(0x80 | (ch & 0x3f));
2147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002149
Tim Peters602f7402002-04-27 18:03:26 +00002150 if (v == NULL) {
2151 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002152 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002153 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002154 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002155 }
2156 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002157 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002158 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002160 if (_PyString_Resize(&v, nneeded))
2161 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002164
Tim Peters602f7402002-04-27 18:03:26 +00002165#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166}
2167
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2169{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 if (!PyUnicode_Check(unicode)) {
2171 PyErr_BadArgument();
2172 return NULL;
2173 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002174 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002175 PyUnicode_GET_SIZE(unicode),
2176 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177}
2178
Walter Dörwald6e390802007-08-17 16:41:28 +00002179/* --- UTF-32 Codec ------------------------------------------------------- */
2180
2181PyObject *
2182PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002183 Py_ssize_t size,
2184 const char *errors,
2185 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002186{
2187 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2188}
2189
2190PyObject *
2191PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002192 Py_ssize_t size,
2193 const char *errors,
2194 int *byteorder,
2195 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002196{
2197 const char *starts = s;
2198 Py_ssize_t startinpos;
2199 Py_ssize_t endinpos;
2200 Py_ssize_t outpos;
2201 PyUnicodeObject *unicode;
2202 Py_UNICODE *p;
2203#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002204 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002205 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002206#else
2207 const int pairs = 0;
2208#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002209 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002210 int bo = 0; /* assume native ordering by default */
2211 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002212 /* Offsets from q for retrieving bytes in the right order. */
2213#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2214 int iorder[] = {0, 1, 2, 3};
2215#else
2216 int iorder[] = {3, 2, 1, 0};
2217#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002218 PyObject *errorHandler = NULL;
2219 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002220
Walter Dörwald6e390802007-08-17 16:41:28 +00002221 q = (unsigned char *)s;
2222 e = q + size;
2223
2224 if (byteorder)
2225 bo = *byteorder;
2226
2227 /* Check for BOM marks (U+FEFF) in the input and adjust current
2228 byte order setting accordingly. In native mode, the leading BOM
2229 mark is skipped, in all other modes, it is copied to the output
2230 stream as-is (giving a ZWNBSP character). */
2231 if (bo == 0) {
2232 if (size >= 4) {
2233 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002234 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002235#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002236 if (bom == 0x0000FEFF) {
2237 q += 4;
2238 bo = -1;
2239 }
2240 else if (bom == 0xFFFE0000) {
2241 q += 4;
2242 bo = 1;
2243 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002244#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002245 if (bom == 0x0000FEFF) {
2246 q += 4;
2247 bo = 1;
2248 }
2249 else if (bom == 0xFFFE0000) {
2250 q += 4;
2251 bo = -1;
2252 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002253#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002254 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002255 }
2256
2257 if (bo == -1) {
2258 /* force LE */
2259 iorder[0] = 0;
2260 iorder[1] = 1;
2261 iorder[2] = 2;
2262 iorder[3] = 3;
2263 }
2264 else if (bo == 1) {
2265 /* force BE */
2266 iorder[0] = 3;
2267 iorder[1] = 2;
2268 iorder[2] = 1;
2269 iorder[3] = 0;
2270 }
2271
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002272 /* On narrow builds we split characters outside the BMP into two
2273 codepoints => count how much extra space we need. */
2274#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002275 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002276 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2277 pairs++;
2278#endif
2279
2280 /* This might be one to much, because of a BOM */
2281 unicode = _PyUnicode_New((size+3)/4+pairs);
2282 if (!unicode)
2283 return NULL;
2284 if (size == 0)
2285 return (PyObject *)unicode;
2286
2287 /* Unpack UTF-32 encoded data */
2288 p = unicode->str;
2289
Walter Dörwald6e390802007-08-17 16:41:28 +00002290 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002291 Py_UCS4 ch;
2292 /* remaining bytes at the end? (size should be divisible by 4) */
2293 if (e-q<4) {
2294 if (consumed)
2295 break;
2296 errmsg = "truncated data";
2297 startinpos = ((const char *)q)-starts;
2298 endinpos = ((const char *)e)-starts;
2299 goto utf32Error;
2300 /* The remaining input chars are ignored if the callback
2301 chooses to skip the input */
2302 }
2303 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2304 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002305
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002306 if (ch >= 0x110000)
2307 {
2308 errmsg = "codepoint not in range(0x110000)";
2309 startinpos = ((const char *)q)-starts;
2310 endinpos = startinpos+4;
2311 goto utf32Error;
2312 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002313#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002314 if (ch >= 0x10000)
2315 {
2316 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2317 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2318 }
2319 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002320#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002321 *p++ = ch;
2322 q += 4;
2323 continue;
2324 utf32Error:
2325 outpos = p-PyUnicode_AS_UNICODE(unicode);
2326 if (unicode_decode_call_errorhandler(
2327 errors, &errorHandler,
2328 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002329 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002330 &unicode, &outpos, &p))
2331 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002332 }
2333
2334 if (byteorder)
2335 *byteorder = bo;
2336
2337 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002338 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002339
2340 /* Adjust length */
2341 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2342 goto onError;
2343
2344 Py_XDECREF(errorHandler);
2345 Py_XDECREF(exc);
2346 return (PyObject *)unicode;
2347
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002348 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002349 Py_DECREF(unicode);
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return NULL;
2353}
2354
2355PyObject *
2356PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002357 Py_ssize_t size,
2358 const char *errors,
2359 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002360{
2361 PyObject *v;
2362 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002363 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002364#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002365 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002366#else
2367 const int pairs = 0;
2368#endif
2369 /* Offsets from p for storing byte pairs in the right order. */
2370#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2371 int iorder[] = {0, 1, 2, 3};
2372#else
2373 int iorder[] = {3, 2, 1, 0};
2374#endif
2375
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002376#define STORECHAR(CH) \
2377 do { \
2378 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2379 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2380 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2381 p[iorder[0]] = (CH) & 0xff; \
2382 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002383 } while(0)
2384
2385 /* In narrow builds we can output surrogate pairs as one codepoint,
2386 so we need less space. */
2387#ifndef Py_UNICODE_WIDE
2388 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002389 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2390 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2391 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002392#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002393 nsize = (size - pairs + (byteorder == 0));
2394 bytesize = nsize * 4;
2395 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002396 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002397 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002398 if (v == NULL)
2399 return NULL;
2400
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002401 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002402 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002403 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002404 if (size == 0)
2405 return v;
2406
2407 if (byteorder == -1) {
2408 /* force LE */
2409 iorder[0] = 0;
2410 iorder[1] = 1;
2411 iorder[2] = 2;
2412 iorder[3] = 3;
2413 }
2414 else if (byteorder == 1) {
2415 /* force BE */
2416 iorder[0] = 3;
2417 iorder[1] = 2;
2418 iorder[2] = 1;
2419 iorder[3] = 0;
2420 }
2421
2422 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002423 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002424#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002425 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2426 Py_UCS4 ch2 = *s;
2427 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2428 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2429 s++;
2430 size--;
2431 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002432 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002433#endif
2434 STORECHAR(ch);
2435 }
2436 return v;
2437#undef STORECHAR
2438}
2439
2440PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2441{
2442 if (!PyUnicode_Check(unicode)) {
2443 PyErr_BadArgument();
2444 return NULL;
2445 }
2446 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002447 PyUnicode_GET_SIZE(unicode),
2448 NULL,
2449 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002450}
2451
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452/* --- UTF-16 Codec ------------------------------------------------------- */
2453
Tim Peters772747b2001-08-09 22:21:55 +00002454PyObject *
2455PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002456 Py_ssize_t size,
2457 const char *errors,
2458 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459{
Walter Dörwald69652032004-09-07 20:24:22 +00002460 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2461}
2462
2463PyObject *
2464PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002465 Py_ssize_t size,
2466 const char *errors,
2467 int *byteorder,
2468 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002469{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002470 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002471 Py_ssize_t startinpos;
2472 Py_ssize_t endinpos;
2473 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 PyUnicodeObject *unicode;
2475 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002476 const unsigned char *q, *e;
2477 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002478 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002479 /* Offsets from q for retrieving byte pairs in the right order. */
2480#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2481 int ihi = 1, ilo = 0;
2482#else
2483 int ihi = 0, ilo = 1;
2484#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 PyObject *errorHandler = NULL;
2486 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487
2488 /* Note: size will always be longer than the resulting Unicode
2489 character count */
2490 unicode = _PyUnicode_New(size);
2491 if (!unicode)
2492 return NULL;
2493 if (size == 0)
2494 return (PyObject *)unicode;
2495
2496 /* Unpack UTF-16 encoded data */
2497 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002498 q = (unsigned char *)s;
2499 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500
2501 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002502 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002504 /* Check for BOM marks (U+FEFF) in the input and adjust current
2505 byte order setting accordingly. In native mode, the leading BOM
2506 mark is skipped, in all other modes, it is copied to the output
2507 stream as-is (giving a ZWNBSP character). */
2508 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002509 if (size >= 2) {
2510 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002511#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002512 if (bom == 0xFEFF) {
2513 q += 2;
2514 bo = -1;
2515 }
2516 else if (bom == 0xFFFE) {
2517 q += 2;
2518 bo = 1;
2519 }
Tim Petersced69f82003-09-16 20:30:58 +00002520#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002521 if (bom == 0xFEFF) {
2522 q += 2;
2523 bo = 1;
2524 }
2525 else if (bom == 0xFFFE) {
2526 q += 2;
2527 bo = -1;
2528 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002529#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002530 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532
Tim Peters772747b2001-08-09 22:21:55 +00002533 if (bo == -1) {
2534 /* force LE */
2535 ihi = 1;
2536 ilo = 0;
2537 }
2538 else if (bo == 1) {
2539 /* force BE */
2540 ihi = 0;
2541 ilo = 1;
2542 }
2543
2544 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002545 Py_UNICODE ch;
2546 /* remaining bytes at the end? (size should be even) */
2547 if (e-q<2) {
2548 if (consumed)
2549 break;
2550 errmsg = "truncated data";
2551 startinpos = ((const char *)q)-starts;
2552 endinpos = ((const char *)e)-starts;
2553 goto utf16Error;
2554 /* The remaining input chars are ignored if the callback
2555 chooses to skip the input */
2556 }
2557 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558
Benjamin Peterson857ce152009-01-31 16:29:18 +00002559 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002560
2561 if (ch < 0xD800 || ch > 0xDFFF) {
2562 *p++ = ch;
2563 continue;
2564 }
2565
2566 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002567 if (e - q < 2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002568 errmsg = "unexpected end of data";
2569 startinpos = (((const char *)q)-2)-starts;
2570 endinpos = ((const char *)e)-starts;
2571 goto utf16Error;
2572 }
2573 if (0xD800 <= ch && ch <= 0xDBFF) {
2574 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2575 q += 2;
2576 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002577#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002578 *p++ = ch;
2579 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002580#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002581 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002582#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002583 continue;
2584 }
2585 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002586 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002587 startinpos = (((const char *)q)-4)-starts;
2588 endinpos = startinpos+2;
2589 goto utf16Error;
2590 }
2591
Benjamin Peterson857ce152009-01-31 16:29:18 +00002592 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002593 errmsg = "illegal encoding";
2594 startinpos = (((const char *)q)-2)-starts;
2595 endinpos = startinpos+2;
2596 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002597
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002598 utf16Error:
2599 outpos = p-PyUnicode_AS_UNICODE(unicode);
2600 if (unicode_decode_call_errorhandler(
2601 errors, &errorHandler,
2602 "utf16", errmsg,
2603 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2604 &unicode, &outpos, &p))
2605 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606 }
2607
2608 if (byteorder)
2609 *byteorder = bo;
2610
Walter Dörwald69652032004-09-07 20:24:22 +00002611 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002612 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002613
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002615 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 goto onError;
2617
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002618 Py_XDECREF(errorHandler);
2619 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 return (PyObject *)unicode;
2621
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002622 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002624 Py_XDECREF(errorHandler);
2625 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 return NULL;
2627}
2628
Tim Peters772747b2001-08-09 22:21:55 +00002629PyObject *
2630PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002631 Py_ssize_t size,
2632 const char *errors,
2633 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634{
2635 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002636 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002637 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002638#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002639 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002640#else
2641 const int pairs = 0;
2642#endif
Tim Peters772747b2001-08-09 22:21:55 +00002643 /* Offsets from p for storing byte pairs in the right order. */
2644#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2645 int ihi = 1, ilo = 0;
2646#else
2647 int ihi = 0, ilo = 1;
2648#endif
2649
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002650#define STORECHAR(CH) \
2651 do { \
2652 p[ihi] = ((CH) >> 8) & 0xff; \
2653 p[ilo] = (CH) & 0xff; \
2654 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002655 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002657#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002658 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002659 if (s[i] >= 0x10000)
2660 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002661#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002662 /* 2 * (size + pairs + (byteorder == 0)) */
2663 if (size > PY_SSIZE_T_MAX ||
2664 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002665 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002666 nsize = size + pairs + (byteorder == 0);
2667 bytesize = nsize * 2;
2668 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002669 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002670 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 if (v == NULL)
2672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002674 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002676 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002677 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002678 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002679
2680 if (byteorder == -1) {
2681 /* force LE */
2682 ihi = 1;
2683 ilo = 0;
2684 }
2685 else if (byteorder == 1) {
2686 /* force BE */
2687 ihi = 0;
2688 ilo = 1;
2689 }
2690
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002691 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002692 Py_UNICODE ch = *s++;
2693 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002694#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002695 if (ch >= 0x10000) {
2696 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2697 ch = 0xD800 | ((ch-0x10000) >> 10);
2698 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002699#endif
Tim Peters772747b2001-08-09 22:21:55 +00002700 STORECHAR(ch);
2701 if (ch2)
2702 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002704 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002705#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706}
2707
2708PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2709{
2710 if (!PyUnicode_Check(unicode)) {
2711 PyErr_BadArgument();
2712 return NULL;
2713 }
2714 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002715 PyUnicode_GET_SIZE(unicode),
2716 NULL,
2717 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718}
2719
2720/* --- Unicode Escape Codec ----------------------------------------------- */
2721
Fredrik Lundh06d12682001-01-24 07:59:11 +00002722static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002723
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002725 Py_ssize_t size,
2726 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002729 Py_ssize_t startinpos;
2730 Py_ssize_t endinpos;
2731 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002736 char* message;
2737 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 PyObject *errorHandler = NULL;
2739 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002740
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 /* Escaped strings will always be longer than the resulting
2742 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002743 length after conversion to the true value.
2744 (but if the error callback returns a long replacement string
2745 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746 v = _PyUnicode_New(size);
2747 if (v == NULL)
2748 goto onError;
2749 if (size == 0)
2750 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002752 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 while (s < end) {
2756 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002757 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002758 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759
2760 /* Non-escape characters are interpreted as Unicode ordinals */
2761 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002762 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 continue;
2764 }
2765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 /* \ - Escapes */
2768 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002769 c = *s++;
2770 if (s > end)
2771 c = '\0'; /* Invalid after \ */
2772 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002774 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 case '\n': break;
2776 case '\\': *p++ = '\\'; break;
2777 case '\'': *p++ = '\''; break;
2778 case '\"': *p++ = '\"'; break;
2779 case 'b': *p++ = '\b'; break;
2780 case 'f': *p++ = '\014'; break; /* FF */
2781 case 't': *p++ = '\t'; break;
2782 case 'n': *p++ = '\n'; break;
2783 case 'r': *p++ = '\r'; break;
2784 case 'v': *p++ = '\013'; break; /* VT */
2785 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2786
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002787 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788 case '0': case '1': case '2': case '3':
2789 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002790 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002791 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002792 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002793 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002794 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002796 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 break;
2798
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002799 /* hex escapes */
2800 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002802 digits = 2;
2803 message = "truncated \\xXX escape";
2804 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002806 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002808 digits = 4;
2809 message = "truncated \\uXXXX escape";
2810 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002811
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002812 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002813 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002814 digits = 8;
2815 message = "truncated \\UXXXXXXXX escape";
2816 hexescape:
2817 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002818 outpos = p-PyUnicode_AS_UNICODE(v);
2819 if (s+digits>end) {
2820 endinpos = size;
2821 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002822 errors, &errorHandler,
2823 "unicodeescape", "end of string in escape sequence",
2824 starts, size, &startinpos, &endinpos, &exc, &s,
2825 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 goto onError;
2827 goto nextByte;
2828 }
2829 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002830 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002831 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002832 endinpos = (s+i+1)-starts;
2833 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002834 errors, &errorHandler,
2835 "unicodeescape", message,
2836 starts, size, &startinpos, &endinpos, &exc, &s,
2837 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002838 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002839 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002840 }
2841 chr = (chr<<4) & ~0xF;
2842 if (c >= '0' && c <= '9')
2843 chr += c - '0';
2844 else if (c >= 'a' && c <= 'f')
2845 chr += 10 + c - 'a';
2846 else
2847 chr += 10 + c - 'A';
2848 }
2849 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002850 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002851 /* _decoding_error will have already written into the
2852 target buffer. */
2853 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002854 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002855 /* when we get here, chr is a 32-bit unicode character */
2856 if (chr <= 0xffff)
2857 /* UCS-2 character */
2858 *p++ = (Py_UNICODE) chr;
2859 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002860 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002861 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002862#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002863 *p++ = chr;
2864#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002865 chr -= 0x10000L;
2866 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002867 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002868#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002869 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002870 endinpos = s-starts;
2871 outpos = p-PyUnicode_AS_UNICODE(v);
2872 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002873 errors, &errorHandler,
2874 "unicodeescape", "illegal Unicode character",
2875 starts, size, &startinpos, &endinpos, &exc, &s,
2876 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002877 goto onError;
2878 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002879 break;
2880
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002881 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002882 case 'N':
2883 message = "malformed \\N character escape";
2884 if (ucnhash_CAPI == NULL) {
2885 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002886 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002887 if (ucnhash_CAPI == NULL)
2888 goto ucnhashError;
2889 }
2890 if (*s == '{') {
2891 const char *start = s+1;
2892 /* look for the closing brace */
2893 while (*s != '}' && s < end)
2894 s++;
2895 if (s > start && s < end && *s == '}') {
2896 /* found a name. look it up in the unicode database */
2897 message = "unknown Unicode character name";
2898 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002899 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002900 goto store;
2901 }
2902 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903 endinpos = s-starts;
2904 outpos = p-PyUnicode_AS_UNICODE(v);
2905 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002906 errors, &errorHandler,
2907 "unicodeescape", message,
2908 starts, size, &startinpos, &endinpos, &exc, &s,
2909 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002910 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002911 break;
2912
2913 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002914 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002915 message = "\\ at end of string";
2916 s--;
2917 endinpos = s-starts;
2918 outpos = p-PyUnicode_AS_UNICODE(v);
2919 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002920 errors, &errorHandler,
2921 "unicodeescape", message,
2922 starts, size, &startinpos, &endinpos, &exc, &s,
2923 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002924 goto onError;
2925 }
2926 else {
2927 *p++ = '\\';
2928 *p++ = (unsigned char)s[-1];
2929 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002930 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002931 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002932 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002933 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002935 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002937 Py_XDECREF(errorHandler);
2938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002940
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002941 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002942 PyErr_SetString(
2943 PyExc_UnicodeError,
2944 "\\N escapes not supported (can't load unicodedata module)"
2945 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002946 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002947 Py_XDECREF(errorHandler);
2948 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002949 return NULL;
2950
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002951 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002953 Py_XDECREF(errorHandler);
2954 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 return NULL;
2956}
2957
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   (This describes unicodeescape_string() below; findchar() is the small
   helper it uses to look for quote characters.)

*/
2964
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002965Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002966 Py_ssize_t size,
2967 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002968{
2969 /* like wcschr, but doesn't stop at NULL characters */
2970
2971 while (size-- > 0) {
2972 if (*s == ch)
2973 return s;
2974 s++;
2975 }
2976
2977 return NULL;
2978}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002979
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980static
2981PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002982 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 int quotes)
2984{
2985 PyObject *repr;
2986 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002988 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002989#ifdef Py_UNICODE_WIDE
2990 const Py_ssize_t expandsize = 10;
2991#else
2992 const Py_ssize_t expandsize = 6;
2993#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994
Neal Norwitz17753ec2006-08-21 22:21:19 +00002995 /* XXX(nnorwitz): rather than over-allocating, it would be
2996 better to choose a different scheme. Perhaps scan the
2997 first N-chars of the string and allocate based on that size.
2998 */
2999 /* Initial allocation is based on the longest-possible unichr
3000 escape.
3001
3002 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3003 unichr, so in this case it's the longest unichr escape. In
3004 narrow (UTF-16) builds this is five chars per source unichr
3005 since there are two unichrs in the surrogate pair, so in narrow
3006 (UTF-16) builds it's not the longest unichr escape.
3007
3008 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3009 so in the narrow (UTF-16) build case it's the longest unichr
3010 escape.
3011 */
3012
Neal Norwitze7d8be82008-07-31 17:17:14 +00003013 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003014 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003015
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003016 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003017 2
3018 + expandsize*size
3019 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 if (repr == NULL)
3021 return NULL;
3022
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003023 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024
3025 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003027 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 !findchar(s, size, '"')) ? '"' : '\'';
3029 }
3030 while (size-- > 0) {
3031 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003032
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003033 /* Escape quotes and backslashes */
3034 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003035 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 *p++ = '\\';
3037 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003038 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003039 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003040
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003041#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003042 /* Map 21-bit characters to '\U00xxxxxx' */
3043 else if (ch >= 0x10000) {
3044 *p++ = '\\';
3045 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003046 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3047 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3048 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3049 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3050 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003053 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003054 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003055 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003056#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003057 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3058 else if (ch >= 0xD800 && ch < 0xDC00) {
3059 Py_UNICODE ch2;
3060 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003061
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003062 ch2 = *s++;
3063 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003064 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003065 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3066 *p++ = '\\';
3067 *p++ = 'U';
3068 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3069 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3070 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3071 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3072 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3075 *p++ = hexdigit[ucs & 0x0000000F];
3076 continue;
3077 }
3078 /* Fall through: isolated surrogates are copied as-is */
3079 s--;
3080 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003081 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003082#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003083
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003085 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086 *p++ = '\\';
3087 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003088 *p++ = hexdigit[(ch >> 12) & 0x000F];
3089 *p++ = hexdigit[(ch >> 8) & 0x000F];
3090 *p++ = hexdigit[(ch >> 4) & 0x000F];
3091 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003093
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003094 /* Map special whitespace to '\t', \n', '\r' */
3095 else if (ch == '\t') {
3096 *p++ = '\\';
3097 *p++ = 't';
3098 }
3099 else if (ch == '\n') {
3100 *p++ = '\\';
3101 *p++ = 'n';
3102 }
3103 else if (ch == '\r') {
3104 *p++ = '\\';
3105 *p++ = 'r';
3106 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003107
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003108 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003109 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003110 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003111 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003112 *p++ = hexdigit[(ch >> 4) & 0x000F];
3113 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003114 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003115
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 /* Copy everything else as-is */
3117 else
3118 *p++ = (char) ch;
3119 }
3120 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003121 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122
3123 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003124 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3125 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126 return repr;
3127}
3128
3129PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003130 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131{
3132 return unicodeescape_string(s, size, 0);
3133}
3134
3135PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3136{
3137 if (!PyUnicode_Check(unicode)) {
3138 PyErr_BadArgument();
3139 return NULL;
3140 }
3141 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003142 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143}
3144
3145/* --- Raw Unicode Escape Codec ------------------------------------------- */
3146
3147PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003148 Py_ssize_t size,
3149 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003151 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003152 Py_ssize_t startinpos;
3153 Py_ssize_t endinpos;
3154 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003155 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003156 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 const char *end;
3158 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 PyObject *errorHandler = NULL;
3160 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003161
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162 /* Escaped strings will always be longer than the resulting
3163 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003164 length after conversion to the true value. (But decoding error
3165 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 v = _PyUnicode_New(size);
3167 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003168 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003170 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003171 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 end = s + size;
3173 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003174 unsigned char c;
3175 Py_UCS4 x;
3176 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003177 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003179 /* Non-escape characters are interpreted as Unicode ordinals */
3180 if (*s != '\\') {
3181 *p++ = (unsigned char)*s++;
3182 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003183 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003184 startinpos = s-starts;
3185
3186 /* \u-escapes are only interpreted iff the number of leading
3187 backslashes if odd */
3188 bs = s;
3189 for (;s < end;) {
3190 if (*s != '\\')
3191 break;
3192 *p++ = (unsigned char)*s++;
3193 }
3194 if (((s - bs) & 1) == 0 ||
3195 s >= end ||
3196 (*s != 'u' && *s != 'U')) {
3197 continue;
3198 }
3199 p--;
3200 count = *s=='u' ? 4 : 8;
3201 s++;
3202
3203 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3204 outpos = p-PyUnicode_AS_UNICODE(v);
3205 for (x = 0, i = 0; i < count; ++i, ++s) {
3206 c = (unsigned char)*s;
3207 if (!isxdigit(c)) {
3208 endinpos = s-starts;
3209 if (unicode_decode_call_errorhandler(
3210 errors, &errorHandler,
3211 "rawunicodeescape", "truncated \\uXXXX",
3212 starts, size, &startinpos, &endinpos, &exc, &s,
3213 &v, &outpos, &p))
3214 goto onError;
3215 goto nextByte;
3216 }
3217 x = (x<<4) & ~0xF;
3218 if (c >= '0' && c <= '9')
3219 x += c - '0';
3220 else if (c >= 'a' && c <= 'f')
3221 x += 10 + c - 'a';
3222 else
3223 x += 10 + c - 'A';
3224 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003225 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003226 /* UCS-2 character */
3227 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003228 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003229 /* UCS-4 character. Either store directly, or as
3230 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003231#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003232 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003233#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003234 x -= 0x10000L;
3235 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3236 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003237#endif
3238 } else {
3239 endinpos = s-starts;
3240 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003241 if (unicode_decode_call_errorhandler(
3242 errors, &errorHandler,
3243 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003244 starts, size, &startinpos, &endinpos, &exc, &s,
3245 &v, &outpos, &p))
3246 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003247 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003248 nextByte:
3249 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003251 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003252 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 Py_XDECREF(errorHandler);
3254 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003256
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003257 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003259 Py_XDECREF(errorHandler);
3260 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 return NULL;
3262}
3263
3264PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003265 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266{
3267 PyObject *repr;
3268 char *p;
3269 char *q;
3270
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003271 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003272#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003273 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003274#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003275 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003276#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003277
Neal Norwitze7d8be82008-07-31 17:17:14 +00003278 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003279 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003280
Neal Norwitze7d8be82008-07-31 17:17:14 +00003281 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003282 if (repr == NULL)
3283 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003284 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003285 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003287 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288 while (size-- > 0) {
3289 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003290#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003291 /* Map 32-bit characters to '\Uxxxxxxxx' */
3292 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003293 *p++ = '\\';
3294 *p++ = 'U';
3295 *p++ = hexdigit[(ch >> 28) & 0xf];
3296 *p++ = hexdigit[(ch >> 24) & 0xf];
3297 *p++ = hexdigit[(ch >> 20) & 0xf];
3298 *p++ = hexdigit[(ch >> 16) & 0xf];
3299 *p++ = hexdigit[(ch >> 12) & 0xf];
3300 *p++ = hexdigit[(ch >> 8) & 0xf];
3301 *p++ = hexdigit[(ch >> 4) & 0xf];
3302 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003303 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003304 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003305#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003306 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3307 if (ch >= 0xD800 && ch < 0xDC00) {
3308 Py_UNICODE ch2;
3309 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003310
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003311 ch2 = *s++;
3312 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003313 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003314 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3315 *p++ = '\\';
3316 *p++ = 'U';
3317 *p++ = hexdigit[(ucs >> 28) & 0xf];
3318 *p++ = hexdigit[(ucs >> 24) & 0xf];
3319 *p++ = hexdigit[(ucs >> 20) & 0xf];
3320 *p++ = hexdigit[(ucs >> 16) & 0xf];
3321 *p++ = hexdigit[(ucs >> 12) & 0xf];
3322 *p++ = hexdigit[(ucs >> 8) & 0xf];
3323 *p++ = hexdigit[(ucs >> 4) & 0xf];
3324 *p++ = hexdigit[ucs & 0xf];
3325 continue;
3326 }
3327 /* Fall through: isolated surrogates are copied as-is */
3328 s--;
3329 size++;
3330 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003331#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003332 /* Map 16-bit characters to '\uxxxx' */
3333 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 *p++ = '\\';
3335 *p++ = 'u';
3336 *p++ = hexdigit[(ch >> 12) & 0xf];
3337 *p++ = hexdigit[(ch >> 8) & 0xf];
3338 *p++ = hexdigit[(ch >> 4) & 0xf];
3339 *p++ = hexdigit[ch & 15];
3340 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003341 /* Copy everything else as-is */
3342 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 *p++ = (char) ch;
3344 }
3345 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003346 if (_PyString_Resize(&repr, p - q))
3347 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 return repr;
3349}
3350
3351PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3352{
3353 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003354 PyErr_BadArgument();
3355 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 }
3357 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003358 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359}
3360
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003361/* --- Unicode Internal Codec ------------------------------------------- */
3362
3363PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003364 Py_ssize_t size,
3365 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003366{
3367 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003368 Py_ssize_t startinpos;
3369 Py_ssize_t endinpos;
3370 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003371 PyUnicodeObject *v;
3372 Py_UNICODE *p;
3373 const char *end;
3374 const char *reason;
3375 PyObject *errorHandler = NULL;
3376 PyObject *exc = NULL;
3377
Neal Norwitzd43069c2006-01-08 01:12:10 +00003378#ifdef Py_UNICODE_WIDE
3379 Py_UNICODE unimax = PyUnicode_GetMax();
3380#endif
3381
Armin Rigo7ccbca92006-10-04 12:17:45 +00003382 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003383 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3384 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003385 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003386 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003387 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003388 p = PyUnicode_AS_UNICODE(v);
3389 end = s + size;
3390
3391 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003392 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003393 /* We have to sanity check the raw data, otherwise doom looms for
3394 some malformed UCS-4 data. */
3395 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003396#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003397 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003398#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003399 end-s < Py_UNICODE_SIZE
3400 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003401 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003402 startinpos = s - starts;
3403 if (end-s < Py_UNICODE_SIZE) {
3404 endinpos = end-starts;
3405 reason = "truncated input";
3406 }
3407 else {
3408 endinpos = s - starts + Py_UNICODE_SIZE;
3409 reason = "illegal code point (> 0x10FFFF)";
3410 }
3411 outpos = p - PyUnicode_AS_UNICODE(v);
3412 if (unicode_decode_call_errorhandler(
3413 errors, &errorHandler,
3414 "unicode_internal", reason,
3415 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003416 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003417 goto onError;
3418 }
3419 }
3420 else {
3421 p++;
3422 s += Py_UNICODE_SIZE;
3423 }
3424 }
3425
Martin v. Löwis412fb672006-04-13 06:34:32 +00003426 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003427 goto onError;
3428 Py_XDECREF(errorHandler);
3429 Py_XDECREF(exc);
3430 return (PyObject *)v;
3431
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003432 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003433 Py_XDECREF(v);
3434 Py_XDECREF(errorHandler);
3435 Py_XDECREF(exc);
3436 return NULL;
3437}
3438
Guido van Rossumd57fd912000-03-10 22:53:23 +00003439/* --- Latin-1 Codec ------------------------------------------------------ */
3440
3441PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003442 Py_ssize_t size,
3443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444{
3445 PyUnicodeObject *v;
3446 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003447
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003449 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003450 Py_UNICODE r = *(unsigned char*)s;
3451 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003452 }
3453
Guido van Rossumd57fd912000-03-10 22:53:23 +00003454 v = _PyUnicode_New(size);
3455 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003456 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003458 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 p = PyUnicode_AS_UNICODE(v);
3460 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003461 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003463
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003464 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 Py_XDECREF(v);
3466 return NULL;
3467}
3468
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003469/* create or adjust a UnicodeEncodeError */
3470static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003471 const char *encoding,
3472 const Py_UNICODE *unicode, Py_ssize_t size,
3473 Py_ssize_t startpos, Py_ssize_t endpos,
3474 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003477 *exceptionObject = PyUnicodeEncodeError_Create(
3478 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479 }
3480 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003481 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3482 goto onError;
3483 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3484 goto onError;
3485 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3486 goto onError;
3487 return;
3488 onError:
3489 Py_DECREF(*exceptionObject);
3490 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 }
3492}
3493
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003494/* raises a UnicodeEncodeError */
3495static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003496 const char *encoding,
3497 const Py_UNICODE *unicode, Py_ssize_t size,
3498 Py_ssize_t startpos, Py_ssize_t endpos,
3499 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500{
3501 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003502 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003504 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505}
3506
3507/* error handling callback helper:
3508 build arguments, call the callback and check the arguments,
3509 put the result into newpos and return the replacement string, which
3510 has to be freed by the caller */
3511static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003512 PyObject **errorHandler,
3513 const char *encoding, const char *reason,
3514 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3515 Py_ssize_t startpos, Py_ssize_t endpos,
3516 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003517{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003518 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519
3520 PyObject *restuple;
3521 PyObject *resunicode;
3522
3523 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003524 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003525 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003526 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 }
3528
3529 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003530 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003532 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003533
3534 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003535 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003537 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003538 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003539 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003540 Py_DECREF(restuple);
3541 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 }
3543 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003544 &resunicode, newpos)) {
3545 Py_DECREF(restuple);
3546 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003547 }
3548 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003549 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003550 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003551 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3552 Py_DECREF(restuple);
3553 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003554 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003555 Py_INCREF(resunicode);
3556 Py_DECREF(restuple);
3557 return resunicode;
3558}
3559
3560static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003561 Py_ssize_t size,
3562 const char *errors,
3563 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003564{
3565 /* output object */
3566 PyObject *res;
3567 /* pointers to the beginning and end+1 of input */
3568 const Py_UNICODE *startp = p;
3569 const Py_UNICODE *endp = p + size;
3570 /* pointer to the beginning of the unencodable characters */
3571 /* const Py_UNICODE *badp = NULL; */
3572 /* pointer into the output */
3573 char *str;
3574 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003575 Py_ssize_t respos = 0;
3576 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003577 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3578 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003579 PyObject *errorHandler = NULL;
3580 PyObject *exc = NULL;
3581 /* the following variable is used for caching string comparisons
3582 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3583 int known_errorHandler = -1;
3584
3585 /* allocate enough for a simple encoding without
3586 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003587 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588 if (res == NULL)
3589 goto onError;
3590 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003591 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003592 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 ressize = size;
3594
3595 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003596 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003598 /* can we encode this? */
3599 if (c<limit) {
3600 /* no overflow check, because we know that the space is enough */
3601 *str++ = (char)c;
3602 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003603 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003604 else {
3605 Py_ssize_t unicodepos = p-startp;
3606 Py_ssize_t requiredsize;
3607 PyObject *repunicode;
3608 Py_ssize_t repsize;
3609 Py_ssize_t newpos;
3610 Py_ssize_t respos;
3611 Py_UNICODE *uni2;
3612 /* startpos for collecting unencodable chars */
3613 const Py_UNICODE *collstart = p;
3614 const Py_UNICODE *collend = p;
3615 /* find all unecodable characters */
3616 while ((collend < endp) && ((*collend)>=limit))
3617 ++collend;
3618 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3619 if (known_errorHandler==-1) {
3620 if ((errors==NULL) || (!strcmp(errors, "strict")))
3621 known_errorHandler = 1;
3622 else if (!strcmp(errors, "replace"))
3623 known_errorHandler = 2;
3624 else if (!strcmp(errors, "ignore"))
3625 known_errorHandler = 3;
3626 else if (!strcmp(errors, "xmlcharrefreplace"))
3627 known_errorHandler = 4;
3628 else
3629 known_errorHandler = 0;
3630 }
3631 switch (known_errorHandler) {
3632 case 1: /* strict */
3633 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3634 goto onError;
3635 case 2: /* replace */
3636 while (collstart++<collend)
3637 *str++ = '?'; /* fall through */
3638 case 3: /* ignore */
3639 p = collend;
3640 break;
3641 case 4: /* xmlcharrefreplace */
3642 respos = str-PyString_AS_STRING(res);
3643 /* determine replacement size (temporarily (mis)uses p) */
3644 for (p = collstart, repsize = 0; p < collend; ++p) {
3645 if (*p<10)
3646 repsize += 2+1+1;
3647 else if (*p<100)
3648 repsize += 2+2+1;
3649 else if (*p<1000)
3650 repsize += 2+3+1;
3651 else if (*p<10000)
3652 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003653#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003654 else
3655 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003656#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003657 else if (*p<100000)
3658 repsize += 2+5+1;
3659 else if (*p<1000000)
3660 repsize += 2+6+1;
3661 else
3662 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003663#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003664 }
3665 requiredsize = respos+repsize+(endp-collend);
3666 if (requiredsize > ressize) {
3667 if (requiredsize<2*ressize)
3668 requiredsize = 2*ressize;
3669 if (_PyString_Resize(&res, requiredsize))
3670 goto onError;
3671 str = PyString_AS_STRING(res) + respos;
3672 ressize = requiredsize;
3673 }
3674 /* generate replacement (temporarily (mis)uses p) */
3675 for (p = collstart; p < collend; ++p) {
3676 str += sprintf(str, "&#%d;", (int)*p);
3677 }
3678 p = collend;
3679 break;
3680 default:
3681 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3682 encoding, reason, startp, size, &exc,
3683 collstart-startp, collend-startp, &newpos);
3684 if (repunicode == NULL)
3685 goto onError;
3686 /* need more space? (at least enough for what we have+the
3687 replacement+the rest of the string, so we won't have to
3688 check space for encodable characters) */
3689 respos = str-PyString_AS_STRING(res);
3690 repsize = PyUnicode_GET_SIZE(repunicode);
3691 requiredsize = respos+repsize+(endp-collend);
3692 if (requiredsize > ressize) {
3693 if (requiredsize<2*ressize)
3694 requiredsize = 2*ressize;
3695 if (_PyString_Resize(&res, requiredsize)) {
3696 Py_DECREF(repunicode);
3697 goto onError;
3698 }
3699 str = PyString_AS_STRING(res) + respos;
3700 ressize = requiredsize;
3701 }
3702 /* check if there is anything unencodable in the replacement
3703 and copy it to the output */
3704 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3705 c = *uni2;
3706 if (c >= limit) {
3707 raise_encode_exception(&exc, encoding, startp, size,
3708 unicodepos, unicodepos+1, reason);
3709 Py_DECREF(repunicode);
3710 goto onError;
3711 }
3712 *str = (char)c;
3713 }
3714 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003715 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003716 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003717 }
3718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719 /* Resize if we allocated too much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003720 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003722 /* If this fails res will be NULL */
3723 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 Py_XDECREF(errorHandler);
3725 Py_XDECREF(exc);
3726 return res;
3727
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003728 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003729 Py_XDECREF(res);
3730 Py_XDECREF(errorHandler);
3731 Py_XDECREF(exc);
3732 return NULL;
3733}
3734
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003736 Py_ssize_t size,
3737 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740}
3741
3742PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3743{
3744 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003745 PyErr_BadArgument();
3746 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 }
3748 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003749 PyUnicode_GET_SIZE(unicode),
3750 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751}
3752
3753/* --- 7-bit ASCII Codec -------------------------------------------------- */
3754
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003756 Py_ssize_t size,
3757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 PyUnicodeObject *v;
3761 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003762 Py_ssize_t startinpos;
3763 Py_ssize_t endinpos;
3764 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 const char *e;
3766 PyObject *errorHandler = NULL;
3767 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003768
Guido van Rossumd57fd912000-03-10 22:53:23 +00003769 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003770 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003771 Py_UNICODE r = *(unsigned char*)s;
3772 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003773 }
Tim Petersced69f82003-09-16 20:30:58 +00003774
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 v = _PyUnicode_New(size);
3776 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003777 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003779 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 e = s + size;
3782 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003783 register unsigned char c = (unsigned char)*s;
3784 if (c < 128) {
3785 *p++ = c;
3786 ++s;
3787 }
3788 else {
3789 startinpos = s-starts;
3790 endinpos = startinpos + 1;
3791 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3792 if (unicode_decode_call_errorhandler(
3793 errors, &errorHandler,
3794 "ascii", "ordinal not in range(128)",
3795 starts, size, &startinpos, &endinpos, &exc, &s,
3796 &v, &outpos, &p))
3797 goto onError;
3798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003800 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003801 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3802 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003803 Py_XDECREF(errorHandler);
3804 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003806
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003807 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 Py_XDECREF(errorHandler);
3810 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 return NULL;
3812}
3813
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003815 Py_ssize_t size,
3816 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003818 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819}
3820
3821PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3822{
3823 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003824 PyErr_BadArgument();
3825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003826 }
3827 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003828 PyUnicode_GET_SIZE(unicode),
3829 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830}
3831
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003832#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003833
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003834/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003835
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003836#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003837#define NEED_RETRY
3838#endif
3839
3840/* XXX This code is limited to "true" double-byte encodings, as
3841 a) it assumes an incomplete character consists of a single byte, and
3842 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003843 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003844
/* Return 1 if IsDBCSLeadByte() accepts the byte at s[offset] and the
   character located by CharPrev() does not rule it out, i.e. one of:
     - CharPrev() returns the byte's own position,
     - the byte CharPrev() points at is not itself a lead byte,
     - the previous character is exactly two bytes wide.
   Returns 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (target - before) == 2;
}
3855
3856/*
3857 * Decode MBCS string into unicode object. If 'final' is set, converts
3858 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3859 */
3860static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003861 const char *s, /* MBCS string */
3862 int size, /* sizeof MBCS string */
3863 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003864{
3865 Py_UNICODE *p;
3866 Py_ssize_t n = 0;
3867 int usize = 0;
3868
3869 assert(size >= 0);
3870
3871 /* Skip trailing lead-byte unless 'final' is set */
3872 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003873 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003874
3875 /* First get the size of the result */
3876 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003877 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3878 if (usize == 0) {
3879 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3880 return -1;
3881 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003882 }
3883
3884 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003885 /* Create unicode object */
3886 *v = _PyUnicode_New(usize);
3887 if (*v == NULL)
3888 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003889 }
3890 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003891 /* Extend unicode object */
3892 n = PyUnicode_GET_SIZE(*v);
3893 if (_PyUnicode_Resize(v, n + usize) < 0)
3894 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003895 }
3896
3897 /* Do the conversion */
3898 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003899 p = PyUnicode_AS_UNICODE(*v) + n;
3900 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3901 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3902 return -1;
3903 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003904 }
3905
3906 return size;
3907}
3908
3909PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003910 Py_ssize_t size,
3911 const char *errors,
3912 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003913{
3914 PyUnicodeObject *v = NULL;
3915 int done;
3916
3917 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003918 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003919
3920#ifdef NEED_RETRY
3921 retry:
3922 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003923 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003924 else
3925#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003926 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003927
3928 if (done < 0) {
3929 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003930 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003931 }
3932
3933 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003934 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003935
3936#ifdef NEED_RETRY
3937 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003938 s += done;
3939 size -= done;
3940 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003941 }
3942#endif
3943
3944 return (PyObject *)v;
3945}
3946
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003947PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003948 Py_ssize_t size,
3949 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003950{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003951 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3952}
3953
3954/*
3955 * Convert unicode into string object (MBCS).
3956 * Returns 0 if succeed, -1 otherwise.
3957 */
3958static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003959 const Py_UNICODE *p, /* unicode */
3960 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003961{
3962 int mbcssize = 0;
3963 Py_ssize_t n = 0;
3964
3965 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003966
3967 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003968 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003969 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3970 if (mbcssize == 0) {
3971 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3972 return -1;
3973 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003974 }
3975
Martin v. Löwisd8251432006-06-14 05:21:04 +00003976 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003977 /* Create string object */
3978 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3979 if (*repr == NULL)
3980 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003981 }
3982 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003983 /* Extend string object */
3984 n = PyString_Size(*repr);
3985 if (_PyString_Resize(repr, n + mbcssize) < 0)
3986 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003987 }
3988
3989 /* Do the conversion */
3990 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003991 char *s = PyString_AS_STRING(*repr) + n;
3992 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3993 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3994 return -1;
3995 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003996 }
3997
3998 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003999}
4000
4001PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004002 Py_ssize_t size,
4003 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004004{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004005 PyObject *repr = NULL;
4006 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004007
Martin v. Löwisd8251432006-06-14 05:21:04 +00004008#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004009 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004010 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004011 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004012 else
4013#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004014 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004015
Martin v. Löwisd8251432006-06-14 05:21:04 +00004016 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004017 Py_XDECREF(repr);
4018 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004019 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004020
4021#ifdef NEED_RETRY
4022 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004023 p += INT_MAX;
4024 size -= INT_MAX;
4025 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004026 }
4027#endif
4028
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004029 return repr;
4030}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004031
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004032PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4033{
4034 if (!PyUnicode_Check(unicode)) {
4035 PyErr_BadArgument();
4036 return NULL;
4037 }
4038 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004039 PyUnicode_GET_SIZE(unicode),
4040 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004041}
4042
Martin v. Löwisd8251432006-06-14 05:21:04 +00004043#undef NEED_RETRY
4044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004045#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004046
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047/* --- Character Mapping Codec -------------------------------------------- */
4048
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004050 Py_ssize_t size,
4051 PyObject *mapping,
4052 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004055 Py_ssize_t startinpos;
4056 Py_ssize_t endinpos;
4057 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 PyUnicodeObject *v;
4060 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004061 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 PyObject *errorHandler = NULL;
4063 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004064 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004066
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 /* Default to Latin-1 */
4068 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004069 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070
4071 v = _PyUnicode_New(size);
4072 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004073 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004075 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004078 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004079 mapstring = PyUnicode_AS_UNICODE(mapping);
4080 maplen = PyUnicode_GET_SIZE(mapping);
4081 while (s < e) {
4082 unsigned char ch = *s;
4083 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004085 if (ch < maplen)
4086 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004088 if (x == 0xfffe) {
4089 /* undefined mapping */
4090 outpos = p-PyUnicode_AS_UNICODE(v);
4091 startinpos = s-starts;
4092 endinpos = startinpos+1;
4093 if (unicode_decode_call_errorhandler(
4094 errors, &errorHandler,
4095 "charmap", "character maps to <undefined>",
4096 starts, size, &startinpos, &endinpos, &exc, &s,
4097 &v, &outpos, &p)) {
4098 goto onError;
4099 }
4100 continue;
4101 }
4102 *p++ = x;
4103 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004104 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004105 }
4106 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004107 while (s < e) {
4108 unsigned char ch = *s;
4109 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004110
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004111 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4112 w = PyInt_FromLong((long)ch);
4113 if (w == NULL)
4114 goto onError;
4115 x = PyObject_GetItem(mapping, w);
4116 Py_DECREF(w);
4117 if (x == NULL) {
4118 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4119 /* No mapping found means: mapping is undefined. */
4120 PyErr_Clear();
4121 x = Py_None;
4122 Py_INCREF(x);
4123 } else
4124 goto onError;
4125 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004126
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004127 /* Apply mapping */
4128 if (PyInt_Check(x)) {
4129 long value = PyInt_AS_LONG(x);
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004130 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004131 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004132 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004133 Py_DECREF(x);
4134 goto onError;
4135 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004136
4137#ifndef Py_UNICODE_WIDE
4138 if (value > 0xFFFF) {
4139 /* see the code for 1-n mapping below */
4140 if (extrachars < 2) {
4141 /* resize first */
4142 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4143 Py_ssize_t needed = 10 - extrachars;
4144 extrachars += needed;
4145 /* XXX overflow detection missing */
4146 if (_PyUnicode_Resize(&v,
4147 PyUnicode_GET_SIZE(v) + needed) < 0) {
4148 Py_DECREF(x);
4149 goto onError;
4150 }
4151 p = PyUnicode_AS_UNICODE(v) + oldpos;
4152 }
4153 value -= 0x10000;
4154 *p++ = 0xD800 | (value >> 10);
4155 *p++ = 0xDC00 | (value & 0x3FF);
4156 extrachars -= 2;
4157 }
4158 else
4159#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004160 *p++ = (Py_UNICODE)value;
4161 }
4162 else if (x == Py_None) {
4163 /* undefined mapping */
4164 outpos = p-PyUnicode_AS_UNICODE(v);
4165 startinpos = s-starts;
4166 endinpos = startinpos+1;
4167 if (unicode_decode_call_errorhandler(
4168 errors, &errorHandler,
4169 "charmap", "character maps to <undefined>",
4170 starts, size, &startinpos, &endinpos, &exc, &s,
4171 &v, &outpos, &p)) {
4172 Py_DECREF(x);
4173 goto onError;
4174 }
4175 Py_DECREF(x);
4176 continue;
4177 }
4178 else if (PyUnicode_Check(x)) {
4179 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004180
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004181 if (targetsize == 1)
4182 /* 1-1 mapping */
4183 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004184
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004185 else if (targetsize > 1) {
4186 /* 1-n mapping */
4187 if (targetsize > extrachars) {
4188 /* resize first */
4189 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4190 Py_ssize_t needed = (targetsize - extrachars) + \
4191 (targetsize << 2);
4192 extrachars += needed;
4193 /* XXX overflow detection missing */
4194 if (_PyUnicode_Resize(&v,
4195 PyUnicode_GET_SIZE(v) + needed) < 0) {
4196 Py_DECREF(x);
4197 goto onError;
4198 }
4199 p = PyUnicode_AS_UNICODE(v) + oldpos;
4200 }
4201 Py_UNICODE_COPY(p,
4202 PyUnicode_AS_UNICODE(x),
4203 targetsize);
4204 p += targetsize;
4205 extrachars -= targetsize;
4206 }
4207 /* 1-0 mapping: skip the character */
4208 }
4209 else {
4210 /* wrong return value */
4211 PyErr_SetString(PyExc_TypeError,
4212 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004213 Py_DECREF(x);
4214 goto onError;
4215 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004216 Py_DECREF(x);
4217 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219 }
4220 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004221 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4222 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004223 Py_XDECREF(errorHandler);
4224 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004226
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004227 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 Py_XDECREF(errorHandler);
4229 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230 Py_XDECREF(v);
4231 return NULL;
4232}
4233
Martin v. Löwis3f767792006-06-04 19:36:28 +00004234/* Charmap encoding: the lookup table */
4235
4236struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004237 PyObject_HEAD
4238 unsigned char level1[32];
4239 int count2, count3;
4240 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004241};
4242
4243static PyObject*
4244encoding_map_size(PyObject *obj, PyObject* args)
4245{
4246 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004247 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004248 128*map->count3);
4249}
4250
4251static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004252 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004253 PyDoc_STR("Return the size (in bytes) of this object") },
4254 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004255};
4256
4257static void
4258encoding_map_dealloc(PyObject* o)
4259{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004260 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004261}
4262
4263static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004264 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004265 "EncodingMap", /*tp_name*/
4266 sizeof(struct encoding_map), /*tp_basicsize*/
4267 0, /*tp_itemsize*/
4268 /* methods */
4269 encoding_map_dealloc, /*tp_dealloc*/
4270 0, /*tp_print*/
4271 0, /*tp_getattr*/
4272 0, /*tp_setattr*/
4273 0, /*tp_compare*/
4274 0, /*tp_repr*/
4275 0, /*tp_as_number*/
4276 0, /*tp_as_sequence*/
4277 0, /*tp_as_mapping*/
4278 0, /*tp_hash*/
4279 0, /*tp_call*/
4280 0, /*tp_str*/
4281 0, /*tp_getattro*/
4282 0, /*tp_setattro*/
4283 0, /*tp_as_buffer*/
4284 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4285 0, /*tp_doc*/
4286 0, /*tp_traverse*/
4287 0, /*tp_clear*/
4288 0, /*tp_richcompare*/
4289 0, /*tp_weaklistoffset*/
4290 0, /*tp_iter*/
4291 0, /*tp_iternext*/
4292 encoding_map_methods, /*tp_methods*/
4293 0, /*tp_members*/
4294 0, /*tp_getset*/
4295 0, /*tp_base*/
4296 0, /*tp_dict*/
4297 0, /*tp_descr_get*/
4298 0, /*tp_descr_set*/
4299 0, /*tp_dictoffset*/
4300 0, /*tp_init*/
4301 0, /*tp_alloc*/
4302 0, /*tp_new*/
4303 0, /*tp_free*/
4304 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004305};
4306
4307PyObject*
4308PyUnicode_BuildEncodingMap(PyObject* string)
4309{
4310 Py_UNICODE *decode;
4311 PyObject *result;
4312 struct encoding_map *mresult;
4313 int i;
4314 int need_dict = 0;
4315 unsigned char level1[32];
4316 unsigned char level2[512];
4317 unsigned char *mlevel1, *mlevel2, *mlevel3;
4318 int count2 = 0, count3 = 0;
4319
4320 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4321 PyErr_BadArgument();
4322 return NULL;
4323 }
4324 decode = PyUnicode_AS_UNICODE(string);
4325 memset(level1, 0xFF, sizeof level1);
4326 memset(level2, 0xFF, sizeof level2);
4327
4328 /* If there isn't a one-to-one mapping of NULL to \0,
4329 or if there are non-BMP characters, we need to use
4330 a mapping dictionary. */
4331 if (decode[0] != 0)
4332 need_dict = 1;
4333 for (i = 1; i < 256; i++) {
4334 int l1, l2;
4335 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004336#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004337 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004338#endif
4339 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004340 need_dict = 1;
4341 break;
4342 }
4343 if (decode[i] == 0xFFFE)
4344 /* unmapped character */
4345 continue;
4346 l1 = decode[i] >> 11;
4347 l2 = decode[i] >> 7;
4348 if (level1[l1] == 0xFF)
4349 level1[l1] = count2++;
4350 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004351 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004352 }
4353
4354 if (count2 >= 0xFF || count3 >= 0xFF)
4355 need_dict = 1;
4356
4357 if (need_dict) {
4358 PyObject *result = PyDict_New();
4359 PyObject *key, *value;
4360 if (!result)
4361 return NULL;
4362 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004363 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004364 key = PyInt_FromLong(decode[i]);
4365 value = PyInt_FromLong(i);
4366 if (!key || !value)
4367 goto failed1;
4368 if (PyDict_SetItem(result, key, value) == -1)
4369 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004370 Py_DECREF(key);
4371 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004372 }
4373 return result;
4374 failed1:
4375 Py_XDECREF(key);
4376 Py_XDECREF(value);
4377 Py_DECREF(result);
4378 return NULL;
4379 }
4380
4381 /* Create a three-level trie */
4382 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4383 16*count2 + 128*count3 - 1);
4384 if (!result)
4385 return PyErr_NoMemory();
4386 PyObject_Init(result, &EncodingMapType);
4387 mresult = (struct encoding_map*)result;
4388 mresult->count2 = count2;
4389 mresult->count3 = count3;
4390 mlevel1 = mresult->level1;
4391 mlevel2 = mresult->level23;
4392 mlevel3 = mresult->level23 + 16*count2;
4393 memcpy(mlevel1, level1, 32);
4394 memset(mlevel2, 0xFF, 16*count2);
4395 memset(mlevel3, 0, 128*count3);
4396 count3 = 0;
4397 for (i = 1; i < 256; i++) {
4398 int o1, o2, o3, i2, i3;
4399 if (decode[i] == 0xFFFE)
4400 /* unmapped character */
4401 continue;
4402 o1 = decode[i]>>11;
4403 o2 = (decode[i]>>7) & 0xF;
4404 i2 = 16*mlevel1[o1] + o2;
4405 if (mlevel2[i2] == 0xFF)
4406 mlevel2[i2] = count3++;
4407 o3 = decode[i] & 0x7F;
4408 i3 = 128*mlevel2[i2] + o3;
4409 mlevel3[i3] = i;
4410 }
4411 return result;
4412}
4413
4414static int
4415encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4416{
4417 struct encoding_map *map = (struct encoding_map*)mapping;
4418 int l1 = c>>11;
4419 int l2 = (c>>7) & 0xF;
4420 int l3 = c & 0x7F;
4421 int i;
4422
4423#ifdef Py_UNICODE_WIDE
4424 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004425 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004426 }
4427#endif
4428 if (c == 0)
4429 return 0;
4430 /* level 1*/
4431 i = map->level1[l1];
4432 if (i == 0xFF) {
4433 return -1;
4434 }
4435 /* level 2*/
4436 i = map->level23[16*i+l2];
4437 if (i == 0xFF) {
4438 return -1;
4439 }
4440 /* level 3 */
4441 i = map->level23[16*map->count2 + 128*i + l3];
4442 if (i == 0) {
4443 return -1;
4444 }
4445 return i;
4446}
4447
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004448/* Lookup the character ch in the mapping. If the character
4449 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004450 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004453 PyObject *w = PyInt_FromLong((long)c);
4454 PyObject *x;
4455
4456 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004457 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458 x = PyObject_GetItem(mapping, w);
4459 Py_DECREF(w);
4460 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004461 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4462 /* No mapping found means: mapping is undefined. */
4463 PyErr_Clear();
4464 x = Py_None;
4465 Py_INCREF(x);
4466 return x;
4467 } else
4468 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004470 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004471 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004473 long value = PyInt_AS_LONG(x);
4474 if (value < 0 || value > 255) {
4475 PyErr_SetString(PyExc_TypeError,
4476 "character mapping must be in range(256)");
4477 Py_DECREF(x);
4478 return NULL;
4479 }
4480 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004482 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004483 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004485 /* wrong return value */
4486 PyErr_SetString(PyExc_TypeError,
4487 "character mapping must return integer, None or str");
4488 Py_DECREF(x);
4489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 }
4491}
4492
Martin v. Löwis3f767792006-06-04 19:36:28 +00004493static int
4494charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4495{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004496 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4497 /* exponentially overallocate to minimize reallocations */
4498 if (requiredsize < 2*outsize)
4499 requiredsize = 2*outsize;
4500 if (_PyString_Resize(outobj, requiredsize)) {
4501 return 0;
4502 }
4503 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004504}
4505
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    /* the character was encoded and written to the output */
    enc_SUCCESS,
    /* the mapping has no entry for the character; nothing was written */
    enc_FAILED,
    /* the lookup or the resizing of the output failed */
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509/* lookup the character, put the result in the output string and adjust
4510 various state variables. Reallocate the output string if not enough
4511 space is available. Return a new reference to the object that
4512 was put in the output buffer, or Py_None, if the mapping was undefined
4513 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004514 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004515static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004516charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004517 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004518{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004519 PyObject *rep;
4520 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004521 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522
Christian Heimese93237d2007-12-19 02:37:44 +00004523 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004524 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004525 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004526 if (res == -1)
4527 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004528 if (outsize<requiredsize)
4529 if (!charmapencode_resize(outobj, outpos, requiredsize))
4530 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004531 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004532 outstart[(*outpos)++] = (char)res;
4533 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004534 }
4535
4536 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004538 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004539 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004540 Py_DECREF(rep);
4541 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004542 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004543 if (PyInt_Check(rep)) {
4544 Py_ssize_t requiredsize = *outpos+1;
4545 if (outsize<requiredsize)
4546 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4547 Py_DECREF(rep);
4548 return enc_EXCEPTION;
4549 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004550 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004551 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004552 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004553 else {
4554 const char *repchars = PyString_AS_STRING(rep);
4555 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4556 Py_ssize_t requiredsize = *outpos+repsize;
4557 if (outsize<requiredsize)
4558 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4559 Py_DECREF(rep);
4560 return enc_EXCEPTION;
4561 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004562 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004563 memcpy(outstart + *outpos, repchars, repsize);
4564 *outpos += repsize;
4565 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 }
Georg Brandl9f167602006-06-04 21:46:16 +00004567 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004568 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569}
4570
4571/* handle an error in PyUnicode_EncodeCharmap
4572 Return 0 on success, -1 on error */
4573static
4574int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004575 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004577 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004578 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004579{
4580 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004581 Py_ssize_t repsize;
4582 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 Py_UNICODE *uni2;
4584 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004585 Py_ssize_t collstartpos = *inpos;
4586 Py_ssize_t collendpos = *inpos+1;
4587 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 char *encoding = "charmap";
4589 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004590 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 /* find all unencodable characters */
4593 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004594 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004595 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004596 int res = encoding_map_lookup(p[collendpos], mapping);
4597 if (res != -1)
4598 break;
4599 ++collendpos;
4600 continue;
4601 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004602
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004603 rep = charmapencode_lookup(p[collendpos], mapping);
4604 if (rep==NULL)
4605 return -1;
4606 else if (rep!=Py_None) {
4607 Py_DECREF(rep);
4608 break;
4609 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004610 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004611 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 }
4613 /* cache callback name lookup
4614 * (if not done yet, i.e. it's the first error) */
4615 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004616 if ((errors==NULL) || (!strcmp(errors, "strict")))
4617 *known_errorHandler = 1;
4618 else if (!strcmp(errors, "replace"))
4619 *known_errorHandler = 2;
4620 else if (!strcmp(errors, "ignore"))
4621 *known_errorHandler = 3;
4622 else if (!strcmp(errors, "xmlcharrefreplace"))
4623 *known_errorHandler = 4;
4624 else
4625 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 }
4627 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004628 case 1: /* strict */
4629 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4630 return -1;
4631 case 2: /* replace */
4632 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004633 x = charmapencode_output('?', mapping, res, respos);
4634 if (x==enc_EXCEPTION) {
4635 return -1;
4636 }
4637 else if (x==enc_FAILED) {
4638 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4639 return -1;
4640 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004641 }
4642 /* fall through */
4643 case 3: /* ignore */
4644 *inpos = collendpos;
4645 break;
4646 case 4: /* xmlcharrefreplace */
4647 /* generate replacement (temporarily (mis)uses p) */
4648 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004649 char buffer[2+29+1+1];
4650 char *cp;
4651 sprintf(buffer, "&#%d;", (int)p[collpos]);
4652 for (cp = buffer; *cp; ++cp) {
4653 x = charmapencode_output(*cp, mapping, res, respos);
4654 if (x==enc_EXCEPTION)
4655 return -1;
4656 else if (x==enc_FAILED) {
4657 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4658 return -1;
4659 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004660 }
4661 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004662 *inpos = collendpos;
4663 break;
4664 default:
4665 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004666 encoding, reason, p, size, exceptionObject,
4667 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004668 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004669 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004670 /* generate replacement */
4671 repsize = PyUnicode_GET_SIZE(repunicode);
4672 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004673 x = charmapencode_output(*uni2, mapping, res, respos);
4674 if (x==enc_EXCEPTION) {
4675 return -1;
4676 }
4677 else if (x==enc_FAILED) {
4678 Py_DECREF(repunicode);
4679 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4680 return -1;
4681 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004682 }
4683 *inpos = newpos;
4684 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004685 }
4686 return 0;
4687}
4688
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004690 Py_ssize_t size,
4691 PyObject *mapping,
4692 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004694 /* output object */
4695 PyObject *res = NULL;
4696 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004697 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004698 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004699 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004700 PyObject *errorHandler = NULL;
4701 PyObject *exc = NULL;
4702 /* the following variable is used for caching string comparisons
4703 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4704 * 3=ignore, 4=xmlcharrefreplace */
4705 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706
4707 /* Default to Latin-1 */
4708 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004709 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004711 /* allocate enough for a simple encoding without
4712 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004713 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 if (res == NULL)
4715 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004716 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004717 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004720 /* try to encode it */
4721 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4722 if (x==enc_EXCEPTION) /* error */
4723 goto onError;
4724 if (x==enc_FAILED) { /* unencodable character */
4725 if (charmap_encoding_error(p, size, &inpos, mapping,
4726 &exc,
4727 &known_errorHandler, &errorHandler, errors,
4728 &res, &respos)) {
4729 goto onError;
4730 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004731 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004732 else
4733 /* done with this character => adjust input position */
4734 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004737 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004738 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004739 if (_PyString_Resize(&res, respos))
4740 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004741 }
4742 Py_XDECREF(exc);
4743 Py_XDECREF(errorHandler);
4744 return res;
4745
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004746 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004747 Py_XDECREF(res);
4748 Py_XDECREF(exc);
4749 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 return NULL;
4751}
4752
4753PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004754 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755{
4756 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004757 PyErr_BadArgument();
4758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759 }
4760 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004761 PyUnicode_GET_SIZE(unicode),
4762 mapping,
4763 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764}
4765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766/* create or adjust a UnicodeTranslateError */
4767static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004768 const Py_UNICODE *unicode, Py_ssize_t size,
4769 Py_ssize_t startpos, Py_ssize_t endpos,
4770 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004772 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004773 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004774 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 }
4776 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004777 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4778 goto onError;
4779 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4780 goto onError;
4781 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4782 goto onError;
4783 return;
4784 onError:
4785 Py_DECREF(*exceptionObject);
4786 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787 }
4788}
4789
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790/* raises a UnicodeTranslateError */
4791static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004792 const Py_UNICODE *unicode, Py_ssize_t size,
4793 Py_ssize_t startpos, Py_ssize_t endpos,
4794 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004795{
4796 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004797 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004799 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800}
4801
4802/* error handling callback helper:
4803 build arguments, call the callback and check the arguments,
4804 put the result into newpos and return the replacement string, which
4805 has to be freed by the caller */
4806static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004807 PyObject **errorHandler,
4808 const char *reason,
4809 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4810 Py_ssize_t startpos, Py_ssize_t endpos,
4811 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004813 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004814
Martin v. Löwis412fb672006-04-13 06:34:32 +00004815 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 PyObject *restuple;
4817 PyObject *resunicode;
4818
4819 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004820 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004822 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 }
4824
4825 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004826 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004827 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004828 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829
4830 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004831 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004833 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004835 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004836 Py_DECREF(restuple);
4837 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004838 }
4839 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004840 &resunicode, &i_newpos)) {
4841 Py_DECREF(restuple);
4842 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004843 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004844 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004845 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004846 else
4847 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004848 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004849 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4850 Py_DECREF(restuple);
4851 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004852 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004853 Py_INCREF(resunicode);
4854 Py_DECREF(restuple);
4855 return resunicode;
4856}
4857
4858/* Lookup the character ch in the mapping and put the result in result,
4859 which must be decrefed by the caller.
4860 Return 0 on success, -1 on error */
4861static
4862int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4863{
4864 PyObject *w = PyInt_FromLong((long)c);
4865 PyObject *x;
4866
4867 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004868 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004869 x = PyObject_GetItem(mapping, w);
4870 Py_DECREF(w);
4871 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004872 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4873 /* No mapping found means: use 1:1 mapping. */
4874 PyErr_Clear();
4875 *result = NULL;
4876 return 0;
4877 } else
4878 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 }
4880 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004881 *result = x;
4882 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004883 }
4884 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004885 long value = PyInt_AS_LONG(x);
4886 long max = PyUnicode_GetMax();
4887 if (value < 0 || value > max) {
4888 PyErr_Format(PyExc_TypeError,
4889 "character mapping must be in range(0x%lx)", max+1);
4890 Py_DECREF(x);
4891 return -1;
4892 }
4893 *result = x;
4894 return 0;
4895 }
4896 else if (PyUnicode_Check(x)) {
4897 *result = x;
4898 return 0;
4899 }
4900 else {
4901 /* wrong return value */
4902 PyErr_SetString(PyExc_TypeError,
4903 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004904 Py_DECREF(x);
4905 return -1;
4906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004907}
4908/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004909 if not reallocate and adjust various state variables.
4910 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004911static
Walter Dörwald4894c302003-10-24 14:25:28 +00004912int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004913 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004915 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004916 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004917 /* remember old output position */
4918 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4919 /* exponentially overallocate to minimize reallocations */
4920 if (requiredsize < 2 * oldsize)
4921 requiredsize = 2 * oldsize;
4922 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4923 return -1;
4924 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004925 }
4926 return 0;
4927}
4928/* lookup the character, put the result in the output string and adjust
4929 various state variables. Return a new reference to the object that
4930 was put in the output buffer in *result, or Py_None, if the mapping was
4931 undefined (in which case no character was written).
4932 The called must decref result.
4933 Return 0 on success, -1 on error. */
4934static
Walter Dörwald4894c302003-10-24 14:25:28 +00004935int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004936 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4937 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004938{
Walter Dörwald4894c302003-10-24 14:25:28 +00004939 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004940 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004941 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004942 /* not found => default to 1:1 mapping */
4943 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004944 }
4945 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004946 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004947 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004948 /* no overflow check, because we know that the space is enough */
4949 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004950 }
4951 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004952 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4953 if (repsize==1) {
4954 /* no overflow check, because we know that the space is enough */
4955 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4956 }
4957 else if (repsize!=0) {
4958 /* more than one character */
4959 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4960 (insize - (curinp-startinp)) +
4961 repsize - 1;
4962 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4963 return -1;
4964 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4965 *outp += repsize;
4966 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 }
4968 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004969 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970 return 0;
4971}
4972
4973PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004974 Py_ssize_t size,
4975 PyObject *mapping,
4976 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978 /* output object */
4979 PyObject *res = NULL;
4980 /* pointers to the beginning and end+1 of input */
4981 const Py_UNICODE *startp = p;
4982 const Py_UNICODE *endp = p + size;
4983 /* pointer into the output */
4984 Py_UNICODE *str;
4985 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004986 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004987 char *reason = "character maps to <undefined>";
4988 PyObject *errorHandler = NULL;
4989 PyObject *exc = NULL;
4990 /* the following variable is used for caching string comparisons
4991 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4992 * 3=ignore, 4=xmlcharrefreplace */
4993 int known_errorHandler = -1;
4994
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004996 PyErr_BadArgument();
4997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004999
5000 /* allocate enough for a simple 1:1 translation without
5001 replacements, if we need more, we'll resize */
5002 res = PyUnicode_FromUnicode(NULL, size);
5003 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005004 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005006 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005009 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005010 /* try to encode it */
5011 PyObject *x = NULL;
5012 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5013 Py_XDECREF(x);
5014 goto onError;
5015 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005016 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005017 if (x!=Py_None) /* it worked => adjust input pointer */
5018 ++p;
5019 else { /* untranslatable character */
5020 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5021 Py_ssize_t repsize;
5022 Py_ssize_t newpos;
5023 Py_UNICODE *uni2;
5024 /* startpos for collecting untranslatable chars */
5025 const Py_UNICODE *collstart = p;
5026 const Py_UNICODE *collend = p+1;
5027 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005029 /* find all untranslatable characters */
5030 while (collend < endp) {
5031 if (charmaptranslate_lookup(*collend, mapping, &x))
5032 goto onError;
5033 Py_XDECREF(x);
5034 if (x!=Py_None)
5035 break;
5036 ++collend;
5037 }
5038 /* cache callback name lookup
5039 * (if not done yet, i.e. it's the first error) */
5040 if (known_errorHandler==-1) {
5041 if ((errors==NULL) || (!strcmp(errors, "strict")))
5042 known_errorHandler = 1;
5043 else if (!strcmp(errors, "replace"))
5044 known_errorHandler = 2;
5045 else if (!strcmp(errors, "ignore"))
5046 known_errorHandler = 3;
5047 else if (!strcmp(errors, "xmlcharrefreplace"))
5048 known_errorHandler = 4;
5049 else
5050 known_errorHandler = 0;
5051 }
5052 switch (known_errorHandler) {
5053 case 1: /* strict */
5054 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005055 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005056 case 2: /* replace */
5057 /* No need to check for space, this is a 1:1 replacement */
5058 for (coll = collstart; coll<collend; ++coll)
5059 *str++ = '?';
5060 /* fall through */
5061 case 3: /* ignore */
5062 p = collend;
5063 break;
5064 case 4: /* xmlcharrefreplace */
5065 /* generate replacement (temporarily (mis)uses p) */
5066 for (p = collstart; p < collend; ++p) {
5067 char buffer[2+29+1+1];
5068 char *cp;
5069 sprintf(buffer, "&#%d;", (int)*p);
5070 if (charmaptranslate_makespace(&res, &str,
5071 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5072 goto onError;
5073 for (cp = buffer; *cp; ++cp)
5074 *str++ = *cp;
5075 }
5076 p = collend;
5077 break;
5078 default:
5079 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5080 reason, startp, size, &exc,
5081 collstart-startp, collend-startp, &newpos);
5082 if (repunicode == NULL)
5083 goto onError;
5084 /* generate replacement */
5085 repsize = PyUnicode_GET_SIZE(repunicode);
5086 if (charmaptranslate_makespace(&res, &str,
5087 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5088 Py_DECREF(repunicode);
5089 goto onError;
5090 }
5091 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5092 *str++ = *uni2;
5093 p = startp + newpos;
5094 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005095 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005096 }
5097 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005098 /* Resize if we allocated to much */
5099 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005100 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005101 if (PyUnicode_Resize(&res, respos) < 0)
5102 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005103 }
5104 Py_XDECREF(exc);
5105 Py_XDECREF(errorHandler);
5106 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005108 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005109 Py_XDECREF(res);
5110 Py_XDECREF(exc);
5111 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 return NULL;
5113}
5114
5115PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005116 PyObject *mapping,
5117 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118{
5119 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005120
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 str = PyUnicode_FromObject(str);
5122 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005123 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005125 PyUnicode_GET_SIZE(str),
5126 mapping,
5127 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128 Py_DECREF(str);
5129 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005130
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005131 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 Py_XDECREF(str);
5133 return NULL;
5134}
Tim Petersced69f82003-09-16 20:30:58 +00005135
Guido van Rossum9e896b32000-04-05 20:11:21 +00005136/* --- Decimal Encoder ---------------------------------------------------- */
5137
5138int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005139 Py_ssize_t length,
5140 char *output,
5141 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005142{
5143 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005144 PyObject *errorHandler = NULL;
5145 PyObject *exc = NULL;
5146 const char *encoding = "decimal";
5147 const char *reason = "invalid decimal Unicode string";
5148 /* the following variable is used for caching string comparisons
5149 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5150 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005151
5152 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005153 PyErr_BadArgument();
5154 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005155 }
5156
5157 p = s;
5158 end = s + length;
5159 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005160 register Py_UNICODE ch = *p;
5161 int decimal;
5162 PyObject *repunicode;
5163 Py_ssize_t repsize;
5164 Py_ssize_t newpos;
5165 Py_UNICODE *uni2;
5166 Py_UNICODE *collstart;
5167 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005168
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005169 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005170 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005171 ++p;
5172 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005173 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005174 decimal = Py_UNICODE_TODECIMAL(ch);
5175 if (decimal >= 0) {
5176 *output++ = '0' + decimal;
5177 ++p;
5178 continue;
5179 }
5180 if (0 < ch && ch < 256) {
5181 *output++ = (char)ch;
5182 ++p;
5183 continue;
5184 }
5185 /* All other characters are considered unencodable */
5186 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005187 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005188 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005189 Py_UNICODE_ISSPACE(*collend) ||
5190 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005191 break;
5192 }
5193 /* cache callback name lookup
5194 * (if not done yet, i.e. it's the first error) */
5195 if (known_errorHandler==-1) {
5196 if ((errors==NULL) || (!strcmp(errors, "strict")))
5197 known_errorHandler = 1;
5198 else if (!strcmp(errors, "replace"))
5199 known_errorHandler = 2;
5200 else if (!strcmp(errors, "ignore"))
5201 known_errorHandler = 3;
5202 else if (!strcmp(errors, "xmlcharrefreplace"))
5203 known_errorHandler = 4;
5204 else
5205 known_errorHandler = 0;
5206 }
5207 switch (known_errorHandler) {
5208 case 1: /* strict */
5209 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5210 goto onError;
5211 case 2: /* replace */
5212 for (p = collstart; p < collend; ++p)
5213 *output++ = '?';
5214 /* fall through */
5215 case 3: /* ignore */
5216 p = collend;
5217 break;
5218 case 4: /* xmlcharrefreplace */
5219 /* generate replacement (temporarily (mis)uses p) */
5220 for (p = collstart; p < collend; ++p)
5221 output += sprintf(output, "&#%d;", (int)*p);
5222 p = collend;
5223 break;
5224 default:
5225 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5226 encoding, reason, s, length, &exc,
5227 collstart-s, collend-s, &newpos);
5228 if (repunicode == NULL)
5229 goto onError;
5230 /* generate replacement */
5231 repsize = PyUnicode_GET_SIZE(repunicode);
5232 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5233 Py_UNICODE ch = *uni2;
5234 if (Py_UNICODE_ISSPACE(ch))
5235 *output++ = ' ';
5236 else {
5237 decimal = Py_UNICODE_TODECIMAL(ch);
5238 if (decimal >= 0)
5239 *output++ = '0' + decimal;
5240 else if (0 < ch && ch < 256)
5241 *output++ = (char)ch;
5242 else {
5243 Py_DECREF(repunicode);
5244 raise_encode_exception(&exc, encoding,
5245 s, length, collstart-s, collend-s, reason);
5246 goto onError;
5247 }
5248 }
5249 }
5250 p = s + newpos;
5251 Py_DECREF(repunicode);
5252 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005253 }
5254 /* 0-terminate the output string */
5255 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005256 Py_XDECREF(exc);
5257 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005258 return 0;
5259
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005260 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005261 Py_XDECREF(exc);
5262 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005263 return -1;
5264}
5265
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266/* --- Helpers ------------------------------------------------------------ */
5267
Eric Smitha9f7d622008-02-17 19:46:49 +00005268#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005269#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005270
5271#include "stringlib/count.h"
5272#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005273#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005274#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005275
/* Helper macro to fix up start/end slice values against a length len.

   end is clamped to len when too large; a negative end or start is made
   relative to len and then clamped at 0.  start is NOT clamped to len,
   so callers must cope with start > end afterwards.

   start and end must be modifiable lvalues; len may be evaluated more
   than once.  The body is wrapped in do { } while (0) so the macro acts
   as a single statement (safe as the body of an unbraced if/else) and
   must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005290
Martin v. Löwis18e16552006-02-15 17:27:45 +00005291Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005292 PyObject *substr,
5293 Py_ssize_t start,
5294 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005296 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005297 PyUnicodeObject* str_obj;
5298 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005299
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005300 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5301 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005302 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005303 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5304 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005305 Py_DECREF(str_obj);
5306 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 }
Tim Petersced69f82003-09-16 20:30:58 +00005308
Antoine Pitrou64672132010-01-13 07:55:48 +00005309 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005310 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005311 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5312 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005313 );
5314
5315 Py_DECREF(sub_obj);
5316 Py_DECREF(str_obj);
5317
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 return result;
5319}
5320
Martin v. Löwis18e16552006-02-15 17:27:45 +00005321Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005322 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005323 Py_ssize_t start,
5324 Py_ssize_t end,
5325 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005327 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005328
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005329 str = PyUnicode_FromObject(str);
5330 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005331 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005332 sub = PyUnicode_FromObject(sub);
5333 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005334 Py_DECREF(str);
5335 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 }
Tim Petersced69f82003-09-16 20:30:58 +00005337
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005338 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005339 result = stringlib_find_slice(
5340 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5341 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5342 start, end
5343 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005344 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005345 result = stringlib_rfind_slice(
5346 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5347 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5348 start, end
5349 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005350
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005351 Py_DECREF(str);
5352 Py_DECREF(sub);
5353
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 return result;
5355}
5356
Tim Petersced69f82003-09-16 20:30:58 +00005357static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005359 PyUnicodeObject *substring,
5360 Py_ssize_t start,
5361 Py_ssize_t end,
5362 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364 if (substring->length == 0)
5365 return 1;
5366
Antoine Pitrou64672132010-01-13 07:55:48 +00005367 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 end -= substring->length;
5369 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005370 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371
5372 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005373 if (Py_UNICODE_MATCH(self, end, substring))
5374 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 } else {
5376 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005377 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 }
5379
5380 return 0;
5381}
5382
Martin v. Löwis18e16552006-02-15 17:27:45 +00005383Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005384 PyObject *substr,
5385 Py_ssize_t start,
5386 Py_ssize_t end,
5387 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005389 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 str = PyUnicode_FromObject(str);
5392 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005393 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 substr = PyUnicode_FromObject(substr);
5395 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005396 Py_DECREF(str);
5397 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 }
Tim Petersced69f82003-09-16 20:30:58 +00005399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005401 (PyUnicodeObject *)substr,
5402 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 Py_DECREF(str);
5404 Py_DECREF(substr);
5405 return result;
5406}
5407
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408/* Apply fixfct filter to the Unicode object self and return a
5409 reference to the modified object */
5410
Tim Petersced69f82003-09-16 20:30:58 +00005411static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005413 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414{
5415
5416 PyUnicodeObject *u;
5417
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005418 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005420 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005421
5422 Py_UNICODE_COPY(u->str, self->str, self->length);
5423
Tim Peters7a29bd52001-09-12 03:03:31 +00005424 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005425 /* fixfct should return TRUE if it modified the buffer. If
5426 FALSE, return a reference to the original buffer instead
5427 (to save space, not time) */
5428 Py_INCREF(self);
5429 Py_DECREF(u);
5430 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 }
5432 return (PyObject*) u;
5433}
5434
Tim Petersced69f82003-09-16 20:30:58 +00005435static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436int fixupper(PyUnicodeObject *self)
5437{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005438 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 Py_UNICODE *s = self->str;
5440 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005441
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005443 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005444
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005445 ch = Py_UNICODE_TOUPPER(*s);
5446 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005448 *s = ch;
5449 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 s++;
5451 }
5452
5453 return status;
5454}
5455
Tim Petersced69f82003-09-16 20:30:58 +00005456static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457int fixlower(PyUnicodeObject *self)
5458{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005459 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 Py_UNICODE *s = self->str;
5461 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005462
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005464 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005465
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005466 ch = Py_UNICODE_TOLOWER(*s);
5467 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005469 *s = ch;
5470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 s++;
5472 }
5473
5474 return status;
5475}
5476
Tim Petersced69f82003-09-16 20:30:58 +00005477static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478int fixswapcase(PyUnicodeObject *self)
5479{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005480 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 Py_UNICODE *s = self->str;
5482 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005483
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 while (len-- > 0) {
5485 if (Py_UNICODE_ISUPPER(*s)) {
5486 *s = Py_UNICODE_TOLOWER(*s);
5487 status = 1;
5488 } else if (Py_UNICODE_ISLOWER(*s)) {
5489 *s = Py_UNICODE_TOUPPER(*s);
5490 status = 1;
5491 }
5492 s++;
5493 }
5494
5495 return status;
5496}
5497
Tim Petersced69f82003-09-16 20:30:58 +00005498static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499int fixcapitalize(PyUnicodeObject *self)
5500{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005501 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005502 Py_UNICODE *s = self->str;
5503 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005504
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005505 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005506 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005507 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005508 *s = Py_UNICODE_TOUPPER(*s);
5509 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005511 s++;
5512 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005513 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005514 *s = Py_UNICODE_TOLOWER(*s);
5515 status = 1;
5516 }
5517 s++;
5518 }
5519 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520}
5521
5522static
5523int fixtitle(PyUnicodeObject *self)
5524{
5525 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5526 register Py_UNICODE *e;
5527 int previous_is_cased;
5528
5529 /* Shortcut for single character strings */
5530 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005531 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5532 if (*p != ch) {
5533 *p = ch;
5534 return 1;
5535 }
5536 else
5537 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 }
Tim Petersced69f82003-09-16 20:30:58 +00005539
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 e = p + PyUnicode_GET_SIZE(self);
5541 previous_is_cased = 0;
5542 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005543 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005544
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005545 if (previous_is_cased)
5546 *p = Py_UNICODE_TOLOWER(ch);
5547 else
5548 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005549
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005550 if (Py_UNICODE_ISLOWER(ch) ||
5551 Py_UNICODE_ISUPPER(ch) ||
5552 Py_UNICODE_ISTITLE(ch))
5553 previous_is_cased = 1;
5554 else
5555 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 }
5557 return 1;
5558}
5559
Tim Peters8ce9f162004-08-27 01:49:32 +00005560PyObject *
5561PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562{
Tim Peters8ce9f162004-08-27 01:49:32 +00005563 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005564 const Py_UNICODE blank = ' ';
5565 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005566 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005567 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005568 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5569 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005570 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5571 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005572 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005573 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005574 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575
Tim Peters05eba1f2004-08-27 21:32:02 +00005576 fseq = PySequence_Fast(seq, "");
5577 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005578 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005579 }
5580
Tim Peters91879ab2004-08-27 22:35:44 +00005581 /* Grrrr. A codec may be invoked to convert str objects to
5582 * Unicode, and so it's possible to call back into Python code
5583 * during PyUnicode_FromObject(), and so it's possible for a sick
5584 * codec to change the size of fseq (if seq is a list). Therefore
5585 * we have to keep refetching the size -- can't assume seqlen
5586 * is invariant.
5587 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005588 seqlen = PySequence_Fast_GET_SIZE(fseq);
5589 /* If empty sequence, return u"". */
5590 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005591 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5592 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005593 }
5594 /* If singleton sequence with an exact Unicode, return that. */
5595 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005596 item = PySequence_Fast_GET_ITEM(fseq, 0);
5597 if (PyUnicode_CheckExact(item)) {
5598 Py_INCREF(item);
5599 res = (PyUnicodeObject *)item;
5600 goto Done;
5601 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005602 }
5603
Tim Peters05eba1f2004-08-27 21:32:02 +00005604 /* At least two items to join, or one that isn't exact Unicode. */
5605 if (seqlen > 1) {
5606 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005607 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005608 sep = &blank;
5609 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005610 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005611 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005612 internal_separator = PyUnicode_FromObject(separator);
5613 if (internal_separator == NULL)
5614 goto onError;
5615 sep = PyUnicode_AS_UNICODE(internal_separator);
5616 seplen = PyUnicode_GET_SIZE(internal_separator);
5617 /* In case PyUnicode_FromObject() mutated seq. */
5618 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005619 }
5620 }
5621
5622 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005623 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005624 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005625 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005626 res_p = PyUnicode_AS_UNICODE(res);
5627 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005628
Tim Peters05eba1f2004-08-27 21:32:02 +00005629 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005630 Py_ssize_t itemlen;
5631 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005632
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005633 item = PySequence_Fast_GET_ITEM(fseq, i);
5634 /* Convert item to Unicode. */
5635 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5636 PyErr_Format(PyExc_TypeError,
5637 "sequence item %zd: expected string or Unicode,"
5638 " %.80s found",
5639 i, Py_TYPE(item)->tp_name);
5640 goto onError;
5641 }
5642 item = PyUnicode_FromObject(item);
5643 if (item == NULL)
5644 goto onError;
5645 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005646
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005647 /* In case PyUnicode_FromObject() mutated seq. */
5648 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005649
Tim Peters8ce9f162004-08-27 01:49:32 +00005650 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005651 itemlen = PyUnicode_GET_SIZE(item);
5652 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005653 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005654 goto Overflow;
5655 if (i < seqlen - 1) {
5656 new_res_used += seplen;
5657 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005658 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005659 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005660 if (new_res_used > res_alloc) {
5661 /* double allocated size until it's big enough */
5662 do {
5663 res_alloc += res_alloc;
5664 if (res_alloc <= 0)
5665 goto Overflow;
5666 } while (new_res_used > res_alloc);
5667 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5668 Py_DECREF(item);
5669 goto onError;
5670 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005671 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005672 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005673
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005674 /* Copy item, and maybe the separator. */
5675 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5676 res_p += itemlen;
5677 if (i < seqlen - 1) {
5678 Py_UNICODE_COPY(res_p, sep, seplen);
5679 res_p += seplen;
5680 }
5681 Py_DECREF(item);
5682 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005683 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005684
Tim Peters05eba1f2004-08-27 21:32:02 +00005685 /* Shrink res to match the used area; this probably can't fail,
5686 * but it's cheap to check.
5687 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005688 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005689 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005690
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005691 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005692 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005693 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 return (PyObject *)res;
5695
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005696 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005697 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005698 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005699 Py_DECREF(item);
5700 /* fall through */
5701
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005702 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005703 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005704 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005705 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 return NULL;
5707}
5708
Tim Petersced69f82003-09-16 20:30:58 +00005709static
5710PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005711 Py_ssize_t left,
5712 Py_ssize_t right,
5713 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714{
5715 PyUnicodeObject *u;
5716
5717 if (left < 0)
5718 left = 0;
5719 if (right < 0)
5720 right = 0;
5721
Tim Peters7a29bd52001-09-12 03:03:31 +00005722 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 Py_INCREF(self);
5724 return self;
5725 }
5726
Neal Norwitze7d8be82008-07-31 17:17:14 +00005727 if (left > PY_SSIZE_T_MAX - self->length ||
5728 right > PY_SSIZE_T_MAX - (left + self->length)) {
5729 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5730 return NULL;
5731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 u = _PyUnicode_New(left + self->length + right);
5733 if (u) {
5734 if (left)
5735 Py_UNICODE_FILL(u->str, fill, left);
5736 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5737 if (right)
5738 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5739 }
5740
5741 return u;
5742}
5743
Antoine Pitrou64672132010-01-13 07:55:48 +00005744PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
5748 string = PyUnicode_FromObject(string);
5749 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751
Antoine Pitrou64672132010-01-13 07:55:48 +00005752 list = stringlib_splitlines(
5753 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5754 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
5756 Py_DECREF(string);
5757 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758}
5759
Tim Petersced69f82003-09-16 20:30:58 +00005760static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005762 PyUnicodeObject *substring,
5763 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005766 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005769 return stringlib_split_whitespace(
5770 (PyObject*) self, self->str, self->length, maxcount
5771 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772
Antoine Pitrou64672132010-01-13 07:55:48 +00005773 return stringlib_split(
5774 (PyObject*) self, self->str, self->length,
5775 substring->str, substring->length,
5776 maxcount
5777 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778}
5779
Tim Petersced69f82003-09-16 20:30:58 +00005780static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005781PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005782 PyUnicodeObject *substring,
5783 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005784{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005785 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005786 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005787
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005788 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005789 return stringlib_rsplit_whitespace(
5790 (PyObject*) self, self->str, self->length, maxcount
5791 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005792
Antoine Pitrou64672132010-01-13 07:55:48 +00005793 return stringlib_rsplit(
5794 (PyObject*) self, self->str, self->length,
5795 substring->str, substring->length,
5796 maxcount
5797 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005798}
5799
5800static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005802 PyUnicodeObject *str1,
5803 PyUnicodeObject *str2,
5804 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805{
5806 PyUnicodeObject *u;
5807
5808 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005809 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005810 else if (maxcount == 0 || self->length == 0)
5811 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812
Fredrik Lundh347ee272006-05-24 16:35:18 +00005813 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005814 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005815 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005816 if (str1->length == 0)
5817 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005818 if (str1->length == 1) {
5819 /* replace characters */
5820 Py_UNICODE u1, u2;
5821 if (!findchar(self->str, self->length, str1->str[0]))
5822 goto nothing;
5823 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5824 if (!u)
5825 return NULL;
5826 Py_UNICODE_COPY(u->str, self->str, self->length);
5827 u1 = str1->str[0];
5828 u2 = str2->str[0];
5829 for (i = 0; i < u->length; i++)
5830 if (u->str[i] == u1) {
5831 if (--maxcount < 0)
5832 break;
5833 u->str[i] = u2;
5834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005836 i = stringlib_find(
5837 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005838 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005839 if (i < 0)
5840 goto nothing;
5841 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5842 if (!u)
5843 return NULL;
5844 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005845
5846 /* change everything in-place, starting with this one */
5847 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5848 i += str1->length;
5849
5850 while ( --maxcount > 0) {
5851 i = stringlib_find(self->str+i, self->length-i,
5852 str1->str, str1->length,
5853 i);
5854 if (i == -1)
5855 break;
5856 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5857 i += str1->length;
5858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005861
Brett Cannona7f13ee2010-05-04 01:16:51 +00005862 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005863 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 Py_UNICODE *p;
5865
5866 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005867 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5868 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005869 if (n == 0)
5870 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005871 /* new_size = self->length + n * (str2->length - str1->length)); */
5872 delta = (str2->length - str1->length);
5873 if (delta == 0) {
5874 new_size = self->length;
5875 } else {
5876 product = n * (str2->length - str1->length);
5877 if ((product / (str2->length - str1->length)) != n) {
5878 PyErr_SetString(PyExc_OverflowError,
5879 "replace string is too long");
5880 return NULL;
5881 }
5882 new_size = self->length + product;
5883 if (new_size < 0) {
5884 PyErr_SetString(PyExc_OverflowError,
5885 "replace string is too long");
5886 return NULL;
5887 }
5888 }
5889 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005890 if (!u)
5891 return NULL;
5892 i = 0;
5893 p = u->str;
5894 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005895 while (n-- > 0) {
5896 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005897 j = stringlib_find(self->str+i, self->length-i,
5898 str1->str, str1->length,
5899 i);
5900 if (j == -1)
5901 break;
5902 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005903 /* copy unchanged part [i:j] */
5904 Py_UNICODE_COPY(p, self->str+i, j-i);
5905 p += j - i;
5906 }
5907 /* copy substitution string */
5908 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005909 Py_UNICODE_COPY(p, str2->str, str2->length);
5910 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005911 }
5912 i = j + str1->length;
5913 }
5914 if (i < self->length)
5915 /* copy tail [i:] */
5916 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005917 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005918 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005919 while (n > 0) {
5920 Py_UNICODE_COPY(p, str2->str, str2->length);
5921 p += str2->length;
5922 if (--n <= 0)
5923 break;
5924 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005926 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 }
5928 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005930
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005931 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005932 /* nothing to replace; return original string (when possible) */
5933 if (PyUnicode_CheckExact(self)) {
5934 Py_INCREF(self);
5935 return (PyObject *) self;
5936 }
5937 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938}
5939
5940/* --- Unicode Object Methods --------------------------------------------- */
5941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005942PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005943 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944\n\
5945Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005946characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947
5948static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005949unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951 return fixup(self, fixtitle);
5952}
5953
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005954PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005955 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956\n\
5957Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00005958have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959
5960static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005961unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 return fixup(self, fixcapitalize);
5964}
5965
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self into words, replaces every word in the list by its
   fixup(..., fixcapitalize) result and joins the list again.
   Returns the joined object, or NULL if splitting or a fixup() call
   fails. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *word = PyList_GET_ITEM(words, pos);

        item = fixup((PyUnicodeObject *)word, fixcapitalize);
        if (item == NULL)
            goto onError;
        /* drop the list's reference to the old word before overwriting it */
        Py_DECREF(word);
        PyList_SET_ITEM(words, pos, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

  onError:
    /* item is NULL here when a fixup() call failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
6003
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006004/* Argument converter. Coerces to a single unicode character */
6005
6006static int
6007convert_uc(PyObject *obj, void *addr)
6008{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006009 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6010 PyObject *uniobj;
6011 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006012
Benjamin Peterson857ce152009-01-31 16:29:18 +00006013 uniobj = PyUnicode_FromObject(obj);
6014 if (uniobj == NULL) {
6015 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006016 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006017 return 0;
6018 }
6019 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6020 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006021 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006022 Py_DECREF(uniobj);
6023 return 0;
6024 }
6025 unistr = PyUnicode_AS_UNICODE(uniobj);
6026 *fillcharloc = unistr[0];
6027 Py_DECREF(uniobj);
6028 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006029}
6030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006031PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006032 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006034Return S centered in a Unicode string of length width. Padding is\n\
6035done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036
6037static PyObject *
6038unicode_center(PyUnicodeObject *self, PyObject *args)
6039{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006040 Py_ssize_t marg, left;
6041 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006042 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043
Thomas Woutersde017742006-02-16 19:34:37 +00006044 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 return NULL;
6046
Tim Peters7a29bd52001-09-12 03:03:31 +00006047 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 Py_INCREF(self);
6049 return (PyObject*) self;
6050 }
6051
6052 marg = width - self->length;
6053 left = marg / 2 + (marg & width & 1);
6054
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006055 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056}
6057
Marc-André Lemburge5034372000-08-08 08:04:29 +00006058#if 0
6059
6060/* This code should go into some future Unicode collation support
6061 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006062 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006063
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006064/* speedy UTF-16 code point order comparison */
6065/* gleaned from: */
6066/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6067
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006068static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006069{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006070 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006071 0, 0, 0, 0, 0, 0, 0, 0,
6072 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006073 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006074};
6075
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076static int
6077unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6078{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006079 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006080
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081 Py_UNICODE *s1 = str1->str;
6082 Py_UNICODE *s2 = str2->str;
6083
6084 len1 = str1->length;
6085 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006086
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006088 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006089
6090 c1 = *s1++;
6091 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006092
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006093 if (c1 > (1<<11) * 26)
6094 c1 += utf16Fixup[c1>>11];
6095 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006096 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006097 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006098
6099 if (c1 != c2)
6100 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006101
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006102 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 }
6104
6105 return (len1 < len2) ? -1 : (len1 != len2);
6106}
6107
Marc-André Lemburge5034372000-08-08 08:04:29 +00006108#else
6109
6110static int
6111unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6112{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006113 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006114
6115 Py_UNICODE *s1 = str1->str;
6116 Py_UNICODE *s2 = str2->str;
6117
6118 len1 = str1->length;
6119 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006120
Marc-André Lemburge5034372000-08-08 08:04:29 +00006121 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006122 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006123
Fredrik Lundh45714e92001-06-26 16:39:36 +00006124 c1 = *s1++;
6125 c2 = *s2++;
6126
6127 if (c1 != c2)
6128 return (c1 < c2) ? -1 : 1;
6129
Marc-André Lemburge5034372000-08-08 08:04:29 +00006130 len1--; len2--;
6131 }
6132
6133 return (len1 < len2) ? -1 : (len1 != len2);
6134}
6135
6136#endif
6137
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006139 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140{
6141 PyUnicodeObject *u = NULL, *v = NULL;
6142 int result;
6143
6144 /* Coerce the two arguments */
6145 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6146 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006147 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6149 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006150 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151
Thomas Wouters7e474022000-07-16 12:04:32 +00006152 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006154 Py_DECREF(u);
6155 Py_DECREF(v);
6156 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 }
6158
6159 result = unicode_compare(u, v);
6160
6161 Py_DECREF(u);
6162 Py_DECREF(v);
6163 return result;
6164
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006165 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 Py_XDECREF(u);
6167 Py_XDECREF(v);
6168 return -1;
6169}
6170
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006171PyObject *PyUnicode_RichCompare(PyObject *left,
6172 PyObject *right,
6173 int op)
6174{
6175 int result;
6176
6177 result = PyUnicode_Compare(left, right);
6178 if (result == -1 && PyErr_Occurred())
6179 goto onError;
6180
6181 /* Convert the return value to a Boolean */
6182 switch (op) {
6183 case Py_EQ:
6184 result = (result == 0);
6185 break;
6186 case Py_NE:
6187 result = (result != 0);
6188 break;
6189 case Py_LE:
6190 result = (result <= 0);
6191 break;
6192 case Py_GE:
6193 result = (result >= 0);
6194 break;
6195 case Py_LT:
6196 result = (result == -1);
6197 break;
6198 case Py_GT:
6199 result = (result == 1);
6200 break;
6201 }
6202 return PyBool_FromLong(result);
6203
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006204 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006205
6206 /* Standard case
6207
6208 Type errors mean that PyUnicode_FromObject() could not convert
6209 one of the arguments (usually the right hand side) to Unicode,
6210 ie. we can't handle the comparison request. However, it is
6211 possible that the other object knows a comparison method, which
6212 is why we return Py_NotImplemented to give the other object a
6213 chance.
6214
6215 */
6216 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6217 PyErr_Clear();
6218 Py_INCREF(Py_NotImplemented);
6219 return Py_NotImplemented;
6220 }
6221 if (op != Py_EQ && op != Py_NE)
6222 return NULL;
6223
6224 /* Equality comparison.
6225
6226 This is a special case: we silence any PyExc_UnicodeDecodeError
6227 and instead turn it into a PyErr_UnicodeWarning.
6228
6229 */
6230 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6231 return NULL;
6232 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006233 if (PyErr_Warn(PyExc_UnicodeWarning,
6234 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006235 "Unicode equal comparison "
6236 "failed to convert both arguments to Unicode - "
6237 "interpreting them as being unequal" :
6238 "Unicode unequal comparison "
6239 "failed to convert both arguments to Unicode - "
6240 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006241 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006242 return NULL;
6243 result = (op == Py_NE);
6244 return PyBool_FromLong(result);
6245}
6246
Guido van Rossum403d68b2000-03-13 15:55:09 +00006247int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006248 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006249{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006250 PyObject *str, *sub;
6251 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006252
6253 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006254 sub = PyUnicode_FromObject(element);
6255 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006256 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006257 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006258
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006259 str = PyUnicode_FromObject(container);
6260 if (!str) {
6261 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006262 return -1;
6263 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006264
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006265 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006266
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006267 Py_DECREF(str);
6268 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006269
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006270 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006271}
6272
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273/* Concat to string or Unicode object giving a new Unicode object. */
6274
6275PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006276 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277{
6278 PyUnicodeObject *u = NULL, *v = NULL, *w;
6279
6280 /* Coerce the two arguments */
6281 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6282 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006283 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6285 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006286 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287
6288 /* Shortcuts */
6289 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006290 Py_DECREF(v);
6291 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 }
6293 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006294 Py_DECREF(u);
6295 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 }
6297
6298 /* Concat the two Unicode strings */
6299 w = _PyUnicode_New(u->length + v->length);
6300 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006301 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 Py_UNICODE_COPY(w->str, u->str, u->length);
6303 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6304
6305 Py_DECREF(u);
6306 Py_DECREF(v);
6307 return (PyObject *)w;
6308
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006309 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 Py_XDECREF(u);
6311 Py_XDECREF(v);
6312 return NULL;
6313}
6314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006315PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006316 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006318Return the number of non-overlapping occurrences of substring sub in\n\
6319Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006320interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321
6322static PyObject *
6323unicode_count(PyUnicodeObject *self, PyObject *args)
6324{
6325 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006326 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006327 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328 PyObject *result;
6329
Jesus Cea44e81682011-04-20 16:39:15 +02006330 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6331 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006332 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006333
Antoine Pitrou64672132010-01-13 07:55:48 +00006334 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006335 result = PyInt_FromSsize_t(
6336 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006337 substring->str, substring->length,
6338 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006339 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340
6341 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006342
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 return result;
6344}
6345
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006346PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006347 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006349Encodes S using the codec registered for encoding. encoding defaults\n\
6350to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006351handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6353'xmlcharrefreplace' as well as any other name registered with\n\
6354codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355
6356static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006357unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006359 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360 char *encoding = NULL;
6361 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006362 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006363
Benjamin Peterson332d7212009-09-18 21:14:55 +00006364 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6365 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006367 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006368 if (v == NULL)
6369 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006370 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006371 PyErr_Format(PyExc_TypeError,
6372 "encoder did not return a string/unicode object "
6373 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006374 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006375 Py_DECREF(v);
6376 return NULL;
6377 }
6378 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006379
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006380 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006381 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006382}
6383
6384PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006385 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006386\n\
6387Decodes S using the codec registered for encoding. encoding defaults\n\
6388to the default encoding. errors may be given to set a different error\n\
6389handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6390a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006391as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006392able to handle UnicodeDecodeErrors.");
6393
6394static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006395unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006396{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006397 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006398 char *encoding = NULL;
6399 char *errors = NULL;
6400 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006401
Benjamin Peterson332d7212009-09-18 21:14:55 +00006402 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6403 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006404 return NULL;
6405 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006406 if (v == NULL)
6407 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006408 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006409 PyErr_Format(PyExc_TypeError,
6410 "decoder did not return a string/unicode object "
6411 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006412 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006413 Py_DECREF(v);
6414 return NULL;
6415 }
6416 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006417
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006418 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006419 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420}
6421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006422PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006423 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424\n\
6425Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006426If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427
6428static PyObject*
6429unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6430{
6431 Py_UNICODE *e;
6432 Py_UNICODE *p;
6433 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006434 Py_UNICODE *qe;
6435 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 PyUnicodeObject *u;
6437 int tabsize = 8;
6438
6439 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006440 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441
Thomas Wouters7e474022000-07-16 12:04:32 +00006442 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006443 i = 0; /* chars up to and including most recent \n or \r */
6444 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6445 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 for (p = self->str; p < e; p++)
6447 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006448 if (tabsize > 0) {
6449 incr = tabsize - (j % tabsize); /* cannot overflow */
6450 if (j > PY_SSIZE_T_MAX - incr)
6451 goto overflow1;
6452 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006453 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006456 if (j > PY_SSIZE_T_MAX - 1)
6457 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458 j++;
6459 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006460 if (i > PY_SSIZE_T_MAX - j)
6461 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006463 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 }
6465 }
6466
Guido van Rossum5bdff602008-03-11 21:18:06 +00006467 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006468 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006469
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 /* Second pass: create output string and fill it */
6471 u = _PyUnicode_New(i + j);
6472 if (!u)
6473 return NULL;
6474
Guido van Rossum5bdff602008-03-11 21:18:06 +00006475 j = 0; /* same as in first pass */
6476 q = u->str; /* next output char */
6477 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478
6479 for (p = self->str; p < e; p++)
6480 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006481 if (tabsize > 0) {
6482 i = tabsize - (j % tabsize);
6483 j += i;
6484 while (i--) {
6485 if (q >= qe)
6486 goto overflow2;
6487 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006488 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006489 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006490 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006491 else {
6492 if (q >= qe)
6493 goto overflow2;
6494 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006495 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496 if (*p == '\n' || *p == '\r')
6497 j = 0;
6498 }
6499
6500 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006501
6502 overflow2:
6503 Py_DECREF(u);
6504 overflow1:
6505 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507}
6508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006509PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006510 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511\n\
6512Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006513such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514arguments start and end are interpreted as in slice notation.\n\
6515\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006516Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517
6518static PyObject *
6519unicode_find(PyUnicodeObject *self, PyObject *args)
6520{
Jesus Cea44e81682011-04-20 16:39:15 +02006521 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006522 Py_ssize_t start;
6523 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006524 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
Jesus Cea44e81682011-04-20 16:39:15 +02006526 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6527 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006530 result = stringlib_find_slice(
6531 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6532 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6533 start, end
6534 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535
6536 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006537
6538 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539}
6540
6541static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006542unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543{
6544 if (index < 0 || index >= self->length) {
6545 PyErr_SetString(PyExc_IndexError, "string index out of range");
6546 return NULL;
6547 }
6548
6549 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6550}
6551
6552static long
6553unicode_hash(PyUnicodeObject *self)
6554{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006555 /* Since Unicode objects compare equal to their ASCII string
6556 counterparts, they should use the individual character values
6557 as basis for their hash value. This is needed to assure that
6558 strings and Unicode objects behave in the same way as
6559 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560
Martin v. Löwis18e16552006-02-15 17:27:45 +00006561 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006562 register Py_UNICODE *p;
6563 register long x;
6564
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006565#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006566 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006567#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006569 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006570 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006571 /*
6572 We make the hash of the empty string be 0, rather than using
6573 (prefix ^ suffix), since this slightly obfuscates the hash secret
6574 */
6575 if (len == 0) {
6576 self->hash = 0;
6577 return 0;
6578 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006579 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006580 x = _Py_HashSecret.prefix;
6581 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006582 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006583 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006584 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006585 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006586 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006587 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006588 self->hash = x;
6589 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590}
6591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006592PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006593 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006595Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596
6597static PyObject *
6598unicode_index(PyUnicodeObject *self, PyObject *args)
6599{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006600 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006601 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006602 Py_ssize_t start;
6603 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604
Jesus Cea44e81682011-04-20 16:39:15 +02006605 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6606 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006609 result = stringlib_find_slice(
6610 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6611 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6612 start, end
6613 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614
6615 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 if (result < 0) {
6618 PyErr_SetString(PyExc_ValueError, "substring not found");
6619 return NULL;
6620 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006621
Martin v. Löwis18e16552006-02-15 17:27:45 +00006622 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623}
6624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006625PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006626 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006628Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006629at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630
6631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006632unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633{
6634 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6635 register const Py_UNICODE *e;
6636 int cased;
6637
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638 /* Shortcut for single character strings */
6639 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006640 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006642 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006643 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006644 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006645
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 e = p + PyUnicode_GET_SIZE(self);
6647 cased = 0;
6648 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006649 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006650
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006651 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6652 return PyBool_FromLong(0);
6653 else if (!cased && Py_UNICODE_ISLOWER(ch))
6654 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006656 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657}
6658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006659PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006660 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006662Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006663at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664
6665static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006666unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667{
6668 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6669 register const Py_UNICODE *e;
6670 int cased;
6671
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 /* Shortcut for single character strings */
6673 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006674 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006676 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006677 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006678 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006679
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 e = p + PyUnicode_GET_SIZE(self);
6681 cased = 0;
6682 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006683 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006684
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006685 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6686 return PyBool_FromLong(0);
6687 else if (!cased && Py_UNICODE_ISUPPER(ch))
6688 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006690 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691}
6692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006693PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006694 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006696Return True if S is a titlecased string and there is at least one\n\
6697character in S, i.e. upper- and titlecase characters may only\n\
6698follow uncased characters and lowercase characters only cased ones.\n\
6699Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700
6701static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006702unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703{
6704 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6705 register const Py_UNICODE *e;
6706 int cased, previous_is_cased;
6707
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 /* Shortcut for single character strings */
6709 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006710 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6711 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006713 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006714 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006715 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006716
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 e = p + PyUnicode_GET_SIZE(self);
6718 cased = 0;
6719 previous_is_cased = 0;
6720 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006721 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006722
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006723 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6724 if (previous_is_cased)
6725 return PyBool_FromLong(0);
6726 previous_is_cased = 1;
6727 cased = 1;
6728 }
6729 else if (Py_UNICODE_ISLOWER(ch)) {
6730 if (!previous_is_cased)
6731 return PyBool_FromLong(0);
6732 previous_is_cased = 1;
6733 cased = 1;
6734 }
6735 else
6736 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006738 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739}
6740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006741PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006742 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006744Return True if all characters in S are whitespace\n\
6745and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746
6747static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006748unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749{
6750 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6751 register const Py_UNICODE *e;
6752
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 /* Shortcut for single character strings */
6754 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006755 Py_UNICODE_ISSPACE(*p))
6756 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006758 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006759 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006760 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006761
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 e = p + PyUnicode_GET_SIZE(self);
6763 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006764 if (!Py_UNICODE_ISSPACE(*p))
6765 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006767 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768}
6769
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006770PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006771 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006772\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006773Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006774and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006775
6776static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006777unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006778{
6779 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6780 register const Py_UNICODE *e;
6781
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006782 /* Shortcut for single character strings */
6783 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006784 Py_UNICODE_ISALPHA(*p))
6785 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006786
6787 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006788 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006789 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006790
6791 e = p + PyUnicode_GET_SIZE(self);
6792 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006793 if (!Py_UNICODE_ISALPHA(*p))
6794 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006795 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006796 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006797}
6798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006799PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006800 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006801\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006802Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006803and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006804
6805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006806unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006807{
6808 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6809 register const Py_UNICODE *e;
6810
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006811 /* Shortcut for single character strings */
6812 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006813 Py_UNICODE_ISALNUM(*p))
6814 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006815
6816 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006817 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006818 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006819
6820 e = p + PyUnicode_GET_SIZE(self);
6821 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006822 if (!Py_UNICODE_ISALNUM(*p))
6823 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006825 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006826}
6827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006828PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006829 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006830\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006831Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006832False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006833
6834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006835unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836{
6837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6838 register const Py_UNICODE *e;
6839
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 /* Shortcut for single character strings */
6841 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006842 Py_UNICODE_ISDECIMAL(*p))
6843 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006845 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006846 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006848
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 e = p + PyUnicode_GET_SIZE(self);
6850 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006851 if (!Py_UNICODE_ISDECIMAL(*p))
6852 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006854 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855}
6856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006857PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006858 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006860Return True if all characters in S are digits\n\
6861and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
6863static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006864unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865{
6866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6867 register const Py_UNICODE *e;
6868
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869 /* Shortcut for single character strings */
6870 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006871 Py_UNICODE_ISDIGIT(*p))
6872 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006874 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006875 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006876 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 e = p + PyUnicode_GET_SIZE(self);
6879 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006880 if (!Py_UNICODE_ISDIGIT(*p))
6881 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006883 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884}
6885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006886PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006887 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006889Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006890False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
6892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006893unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894{
6895 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6896 register const Py_UNICODE *e;
6897
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 /* Shortcut for single character strings */
6899 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006900 Py_UNICODE_ISNUMERIC(*p))
6901 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006903 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006904 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006905 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006906
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 e = p + PyUnicode_GET_SIZE(self);
6908 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006909 if (!Py_UNICODE_ISNUMERIC(*p))
6910 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006912 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913}
6914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006915PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006916 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917\n\
6918Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006919iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920
6921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006922unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006924 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925}
6926
Martin v. Löwis18e16552006-02-15 17:27:45 +00006927static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928unicode_length(PyUnicodeObject *self)
6929{
6930 return self->length;
6931}
6932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006933PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006934 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006936Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006937done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938
6939static PyObject *
6940unicode_ljust(PyUnicodeObject *self, PyObject *args)
6941{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006942 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006943 Py_UNICODE fillchar = ' ';
6944
Martin v. Löwis412fb672006-04-13 06:34:32 +00006945 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 return NULL;
6947
Tim Peters7a29bd52001-09-12 03:03:31 +00006948 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 Py_INCREF(self);
6950 return (PyObject*) self;
6951 }
6952
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006953 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954}
6955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006956PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006957 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006959Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006960
6961static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006962unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964 return fixup(self, fixlower);
6965}
6966
/* Strip modes.  The values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a strip mode: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
6975
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006976/* externally visible for str.strip(unicode) */
6977PyObject *
6978_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6979{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006980 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6981 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6982 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6983 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6984 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006985
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006986 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006987
Benjamin Peterson857ce152009-01-31 16:29:18 +00006988 i = 0;
6989 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006990 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6991 i++;
6992 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006993 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006994
Benjamin Peterson857ce152009-01-31 16:29:18 +00006995 j = len;
6996 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006997 do {
6998 j--;
6999 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7000 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007001 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007002
Benjamin Peterson857ce152009-01-31 16:29:18 +00007003 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007004 Py_INCREF(self);
7005 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007006 }
7007 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007008 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007009}
7010
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011
7012static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007013do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007015 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7016 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007017
Benjamin Peterson857ce152009-01-31 16:29:18 +00007018 i = 0;
7019 if (striptype != RIGHTSTRIP) {
7020 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7021 i++;
7022 }
7023 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007024
Benjamin Peterson857ce152009-01-31 16:29:18 +00007025 j = len;
7026 if (striptype != LEFTSTRIP) {
7027 do {
7028 j--;
7029 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7030 j++;
7031 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007032
Benjamin Peterson857ce152009-01-31 16:29:18 +00007033 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7034 Py_INCREF(self);
7035 return (PyObject*)self;
7036 }
7037 else
7038 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039}
7040
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007041
7042static PyObject *
7043do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7044{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007045 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007046
Benjamin Peterson857ce152009-01-31 16:29:18 +00007047 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7048 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007049
Benjamin Peterson857ce152009-01-31 16:29:18 +00007050 if (sep != NULL && sep != Py_None) {
7051 if (PyUnicode_Check(sep))
7052 return _PyUnicode_XStrip(self, striptype, sep);
7053 else if (PyString_Check(sep)) {
7054 PyObject *res;
7055 sep = PyUnicode_FromObject(sep);
7056 if (sep==NULL)
7057 return NULL;
7058 res = _PyUnicode_XStrip(self, striptype, sep);
7059 Py_DECREF(sep);
7060 return res;
7061 }
7062 else {
7063 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007064 "%s arg must be None, unicode or str",
7065 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007066 return NULL;
7067 }
7068 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007069
Benjamin Peterson857ce152009-01-31 16:29:18 +00007070 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007071}
7072
7073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007074PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007075 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007076\n\
7077Return a copy of the string S with leading and trailing\n\
7078whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007079If chars is given and not None, remove characters in chars instead.\n\
7080If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007081
7082static PyObject *
7083unicode_strip(PyUnicodeObject *self, PyObject *args)
7084{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007085 if (PyTuple_GET_SIZE(args) == 0)
7086 return do_strip(self, BOTHSTRIP); /* Common case */
7087 else
7088 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007089}
7090
7091
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007092PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007093 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007094\n\
7095Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007096If chars is given and not None, remove characters in chars instead.\n\
7097If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007098
7099static PyObject *
7100unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7101{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007102 if (PyTuple_GET_SIZE(args) == 0)
7103 return do_strip(self, LEFTSTRIP); /* Common case */
7104 else
7105 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007106}
7107
7108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007109PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007110 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007111\n\
7112Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007113If chars is given and not None, remove characters in chars instead.\n\
7114If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007115
7116static PyObject *
7117unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7118{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007119 if (PyTuple_GET_SIZE(args) == 0)
7120 return do_strip(self, RIGHTSTRIP); /* Common case */
7121 else
7122 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007123}
7124
7125
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007127unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128{
7129 PyUnicodeObject *u;
7130 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007131 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007132 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133
7134 if (len < 0)
7135 len = 0;
7136
Tim Peters7a29bd52001-09-12 03:03:31 +00007137 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138 /* no repeat, return original string */
7139 Py_INCREF(str);
7140 return (PyObject*) str;
7141 }
Tim Peters8f422462000-09-09 06:13:41 +00007142
7143 /* ensure # of chars needed doesn't overflow int and # of bytes
7144 * needed doesn't overflow size_t
7145 */
7146 nchars = len * str->length;
7147 if (len && nchars / len != str->length) {
7148 PyErr_SetString(PyExc_OverflowError,
7149 "repeated string is too long");
7150 return NULL;
7151 }
7152 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7153 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7154 PyErr_SetString(PyExc_OverflowError,
7155 "repeated string is too long");
7156 return NULL;
7157 }
7158 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159 if (!u)
7160 return NULL;
7161
7162 p = u->str;
7163
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007164 if (str->length == 1 && len > 0) {
7165 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007166 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007167 Py_ssize_t done = 0; /* number of characters copied this far */
7168 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007169 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007170 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007171 }
7172 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007173 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007174 Py_UNICODE_COPY(p+done, p, n);
7175 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007176 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007178
7179 return (PyObject*) u;
7180}
7181
7182PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007183 PyObject *subobj,
7184 PyObject *replobj,
7185 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186{
7187 PyObject *self;
7188 PyObject *str1;
7189 PyObject *str2;
7190 PyObject *result;
7191
7192 self = PyUnicode_FromObject(obj);
7193 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007194 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007195 str1 = PyUnicode_FromObject(subobj);
7196 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007197 Py_DECREF(self);
7198 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 }
7200 str2 = PyUnicode_FromObject(replobj);
7201 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007202 Py_DECREF(self);
7203 Py_DECREF(str1);
7204 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007205 }
Tim Petersced69f82003-09-16 20:30:58 +00007206 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007207 (PyUnicodeObject *)str1,
7208 (PyUnicodeObject *)str2,
7209 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 Py_DECREF(self);
7211 Py_DECREF(str1);
7212 Py_DECREF(str2);
7213 return result;
7214}
7215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007216PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007217 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218\n\
7219Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007220old replaced by new. If the optional argument count is\n\
7221given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222
7223static PyObject*
7224unicode_replace(PyUnicodeObject *self, PyObject *args)
7225{
7226 PyUnicodeObject *str1;
7227 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007228 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 PyObject *result;
7230
Martin v. Löwis18e16552006-02-15 17:27:45 +00007231 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232 return NULL;
7233 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7234 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007237 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007238 Py_DECREF(str1);
7239 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007241
7242 result = replace(self, str1, str2, maxcount);
7243
7244 Py_DECREF(str1);
7245 Py_DECREF(str2);
7246 return result;
7247}
7248
7249static
7250PyObject *unicode_repr(PyObject *unicode)
7251{
7252 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007253 PyUnicode_GET_SIZE(unicode),
7254 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255}
7256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007257PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007258 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259\n\
7260Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007261such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262arguments start and end are interpreted as in slice notation.\n\
7263\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007264Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265
7266static PyObject *
7267unicode_rfind(PyUnicodeObject *self, PyObject *args)
7268{
Jesus Cea44e81682011-04-20 16:39:15 +02007269 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007270 Py_ssize_t start;
7271 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007272 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273
Jesus Cea44e81682011-04-20 16:39:15 +02007274 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7275 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007276 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007278 result = stringlib_rfind_slice(
7279 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7280 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7281 start, end
7282 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283
7284 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007285
7286 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007287}
7288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007289PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007290 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007292Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007293
7294static PyObject *
7295unicode_rindex(PyUnicodeObject *self, PyObject *args)
7296{
Jesus Cea44e81682011-04-20 16:39:15 +02007297 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007298 Py_ssize_t start;
7299 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007300 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301
Jesus Cea44e81682011-04-20 16:39:15 +02007302 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7303 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007306 result = stringlib_rfind_slice(
7307 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7308 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7309 start, end
7310 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311
7312 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007313
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 if (result < 0) {
7315 PyErr_SetString(PyExc_ValueError, "substring not found");
7316 return NULL;
7317 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007318 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319}
7320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007321PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007322 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007324Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007325done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326
7327static PyObject *
7328unicode_rjust(PyUnicodeObject *self, PyObject *args)
7329{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007330 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007331 Py_UNICODE fillchar = ' ';
7332
Martin v. Löwis412fb672006-04-13 06:34:32 +00007333 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 return NULL;
7335
Tim Peters7a29bd52001-09-12 03:03:31 +00007336 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007337 Py_INCREF(self);
7338 return (PyObject*) self;
7339 }
7340
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007341 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342}
7343
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007345unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346{
7347 /* standard clamping */
7348 if (start < 0)
7349 start = 0;
7350 if (end < 0)
7351 end = 0;
7352 if (end > self->length)
7353 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007354 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355 /* full slice, return original string */
7356 Py_INCREF(self);
7357 return (PyObject*) self;
7358 }
7359 if (start > end)
7360 start = end;
7361 /* copy slice */
7362 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007363 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364}
7365
7366PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007367 PyObject *sep,
7368 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369{
7370 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007371
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372 s = PyUnicode_FromObject(s);
7373 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007374 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007375 if (sep != NULL) {
7376 sep = PyUnicode_FromObject(sep);
7377 if (sep == NULL) {
7378 Py_DECREF(s);
7379 return NULL;
7380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 }
7382
7383 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7384
7385 Py_DECREF(s);
7386 Py_XDECREF(sep);
7387 return result;
7388}
7389
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007390PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007391 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007392\n\
7393Return a list of the words in S, using sep as the\n\
7394delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007395splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007396whitespace string is a separator and empty strings are\n\
7397removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398
7399static PyObject*
7400unicode_split(PyUnicodeObject *self, PyObject *args)
7401{
7402 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007403 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404
Martin v. Löwis18e16552006-02-15 17:27:45 +00007405 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 return NULL;
7407
7408 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007409 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007411 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007413 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414}
7415
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007416PyObject *
7417PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7418{
7419 PyObject* str_obj;
7420 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007421 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007422
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007423 str_obj = PyUnicode_FromObject(str_in);
7424 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007425 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007426 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007427 if (!sep_obj) {
7428 Py_DECREF(str_obj);
7429 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007430 }
7431
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007432 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007433 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7434 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7435 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007436
Fredrik Lundhb9479482006-05-26 17:22:38 +00007437 Py_DECREF(sep_obj);
7438 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007439
7440 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007441}
7442
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007443
7444PyObject *
7445PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7446{
7447 PyObject* str_obj;
7448 PyObject* sep_obj;
7449 PyObject* out;
7450
7451 str_obj = PyUnicode_FromObject(str_in);
7452 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007453 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007454 sep_obj = PyUnicode_FromObject(sep_in);
7455 if (!sep_obj) {
7456 Py_DECREF(str_obj);
7457 return NULL;
7458 }
7459
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007460 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007461 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7462 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7463 );
7464
7465 Py_DECREF(sep_obj);
7466 Py_DECREF(str_obj);
7467
7468 return out;
7469}
7470
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007471PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007472 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007473\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007474Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007475the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007476found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007477
7478static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007479unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007480{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007481 return PyUnicode_Partition((PyObject *)self, separator);
7482}
7483
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007484PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007485 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007486\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007487Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007488the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007489separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007490
7491static PyObject*
7492unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7493{
7494 return PyUnicode_RPartition((PyObject *)self, separator);
7495}
7496
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007497PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007498 PyObject *sep,
7499 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007500{
7501 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007502
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007503 s = PyUnicode_FromObject(s);
7504 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007505 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007506 if (sep != NULL) {
7507 sep = PyUnicode_FromObject(sep);
7508 if (sep == NULL) {
7509 Py_DECREF(s);
7510 return NULL;
7511 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007512 }
7513
7514 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7515
7516 Py_DECREF(s);
7517 Py_XDECREF(sep);
7518 return result;
7519}
7520
7521PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007522 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007523\n\
7524Return a list of the words in S, using sep as the\n\
7525delimiter string, starting at the end of the string and\n\
7526working to the front. If maxsplit is given, at most maxsplit\n\
7527splits are done. If sep is not specified, any whitespace string\n\
7528is a separator.");
7529
7530static PyObject*
7531unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7532{
7533 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007534 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007535
Martin v. Löwis18e16552006-02-15 17:27:45 +00007536 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007537 return NULL;
7538
7539 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007540 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007541 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007542 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007543 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007544 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007545}
7546
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007547PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007548 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549\n\
7550Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007551Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007552is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553
7554static PyObject*
7555unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7556{
Guido van Rossum86662912000-04-11 15:38:46 +00007557 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558
Guido van Rossum86662912000-04-11 15:38:46 +00007559 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560 return NULL;
7561
Guido van Rossum86662912000-04-11 15:38:46 +00007562 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563}
7564
7565static
7566PyObject *unicode_str(PyUnicodeObject *self)
7567{
Fred Drakee4315f52000-05-09 19:53:39 +00007568 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569}
7570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007571PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007572 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573\n\
7574Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007575and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
7577static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007578unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580 return fixup(self, fixswapcase);
7581}
7582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007584 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585\n\
7586Return a copy of the string S, where all characters have been mapped\n\
7587through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007588Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7589Unmapped characters are left untouched. Characters mapped to None\n\
7590are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591
7592static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007593unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594{
Tim Petersced69f82003-09-16 20:30:58 +00007595 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007596 self->length,
7597 table,
7598 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599}
7600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007601PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007602 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007604Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605
7606static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007607unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609 return fixup(self, fixupper);
7610}
7611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007612PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007613 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614\n\
Georg Brandl98064072008-09-09 19:26:00 +00007615Pad a numeric string S with zeros on the left, to fill a field\n\
7616of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617
7618static PyObject *
7619unicode_zfill(PyUnicodeObject *self, PyObject *args)
7620{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007621 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622 PyUnicodeObject *u;
7623
Martin v. Löwis18e16552006-02-15 17:27:45 +00007624 Py_ssize_t width;
7625 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626 return NULL;
7627
7628 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007629 if (PyUnicode_CheckExact(self)) {
7630 Py_INCREF(self);
7631 return (PyObject*) self;
7632 }
7633 else
7634 return PyUnicode_FromUnicode(
7635 PyUnicode_AS_UNICODE(self),
7636 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007637 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 }
7639
7640 fill = width - self->length;
7641
7642 u = pad(self, fill, 0, '0');
7643
Walter Dörwald068325e2002-04-15 13:36:47 +00007644 if (u == NULL)
7645 return NULL;
7646
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647 if (u->str[fill] == '+' || u->str[fill] == '-') {
7648 /* move sign to beginning of string */
7649 u->str[0] = u->str[fill];
7650 u->str[fill] = '0';
7651 }
7652
7653 return (PyObject*) u;
7654}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655
#if 0
/* Compiled out: returns the current value of numfree as a Python int. */
static PyObject *
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(
        numfree);
}
#endif
7663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007664PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007665 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007667Return True if S starts with the specified prefix, False otherwise.\n\
7668With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007669With optional end, stop comparing S at that position.\n\
7670prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671
7672static PyObject *
7673unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007674 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675{
Georg Brandl24250812006-06-09 18:45:48 +00007676 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007678 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007679 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007680 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681
Jesus Cea44e81682011-04-20 16:39:15 +02007682 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007683 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007684 if (PyTuple_Check(subobj)) {
7685 Py_ssize_t i;
7686 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7687 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007688 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007689 if (substring == NULL)
7690 return NULL;
7691 result = tailmatch(self, substring, start, end, -1);
7692 Py_DECREF(substring);
7693 if (result) {
7694 Py_RETURN_TRUE;
7695 }
7696 }
7697 /* nothing matched */
7698 Py_RETURN_FALSE;
7699 }
7700 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007701 if (substring == NULL) {
7702 if (PyErr_ExceptionMatches(PyExc_TypeError))
7703 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7704 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007705 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007706 }
Georg Brandl24250812006-06-09 18:45:48 +00007707 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007709 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710}
7711
7712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007713PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007714 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007716Return True if S ends with the specified suffix, False otherwise.\n\
7717With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007718With optional end, stop comparing S at that position.\n\
7719suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720
7721static PyObject *
7722unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007723 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724{
Georg Brandl24250812006-06-09 18:45:48 +00007725 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007727 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007728 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007729 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
Jesus Cea44e81682011-04-20 16:39:15 +02007731 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007732 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007733 if (PyTuple_Check(subobj)) {
7734 Py_ssize_t i;
7735 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7736 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007737 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007738 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007739 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007740 result = tailmatch(self, substring, start, end, +1);
7741 Py_DECREF(substring);
7742 if (result) {
7743 Py_RETURN_TRUE;
7744 }
7745 }
7746 Py_RETURN_FALSE;
7747 }
7748 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007749 if (substring == NULL) {
7750 if (PyErr_ExceptionMatches(PyExc_TypeError))
7751 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7752 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007753 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007754 }
Georg Brandl24250812006-06-09 18:45:48 +00007755 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007757 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758}
7759
7760
Eric Smitha9f7d622008-02-17 19:46:49 +00007761/* Implements do_string_format, which is unicode because of stringlib */
7762#include "stringlib/string_format.h"
7763
7764PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007765 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007766\n\
Eric Smith6c840852010-11-06 19:43:44 +00007767Return a formatted version of S, using substitutions from args and kwargs.\n\
7768The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007769
Eric Smithdc13b792008-05-30 18:10:04 +00007770static PyObject *
7771unicode__format__(PyObject *self, PyObject *args)
7772{
7773 PyObject *format_spec;
7774 PyObject *result = NULL;
7775 PyObject *tmp = NULL;
7776
7777 /* If 2.x, convert format_spec to the same type as value */
7778 /* This is to allow things like u''.format('') */
7779 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7780 goto done;
7781 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7782 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007783 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007784 goto done;
7785 }
7786 tmp = PyObject_Unicode(format_spec);
7787 if (tmp == NULL)
7788 goto done;
7789 format_spec = tmp;
7790
7791 result = _PyUnicode_FormatAdvanced(self,
7792 PyUnicode_AS_UNICODE(format_spec),
7793 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007794 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007795 Py_XDECREF(tmp);
7796 return result;
7797}
7798
Eric Smitha9f7d622008-02-17 19:46:49 +00007799PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007800 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007801\n\
Eric Smith6c840852010-11-06 19:43:44 +00007802Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007803
Robert Schuppenies901c9972008-06-10 10:10:31 +00007804static PyObject *
7805unicode__sizeof__(PyUnicodeObject *v)
7806{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007807 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7808 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007809}
7810
7811PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007812 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007813\n\
7814");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007815
7816static PyObject *
7817unicode_getnewargs(PyUnicodeObject *v)
7818{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007819 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007820}
7821
7822
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007824 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007825 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7826 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007827 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007828 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7829 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7830 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7831 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7832 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7833 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7834 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007835 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007836 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7837 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7838 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007839 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007840 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007841/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7842 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7843 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7844 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007845 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007846 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007847 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007848 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007849 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7850 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7851 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7852 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7853 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7854 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7855 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7856 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7857 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7858 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7859 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7860 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7861 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7862 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007863 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007864 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7865 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7866 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7867 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007868 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007869#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007870 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871#endif
7872
7873#if 0
7874 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007875 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876#endif
7877
Benjamin Peterson857ce152009-01-31 16:29:18 +00007878 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 {NULL, NULL}
7880};
7881
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007882static PyObject *
7883unicode_mod(PyObject *v, PyObject *w)
7884{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007885 if (!PyUnicode_Check(v)) {
7886 Py_INCREF(Py_NotImplemented);
7887 return Py_NotImplemented;
7888 }
7889 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007890}
7891
7892static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007893 0, /*nb_add*/
7894 0, /*nb_subtract*/
7895 0, /*nb_multiply*/
7896 0, /*nb_divide*/
7897 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007898};
7899
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007901 (lenfunc) unicode_length, /* sq_length */
7902 PyUnicode_Concat, /* sq_concat */
7903 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7904 (ssizeargfunc) unicode_getitem, /* sq_item */
7905 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7906 0, /* sq_ass_item */
7907 0, /* sq_ass_slice */
7908 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909};
7910
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007911static PyObject*
7912unicode_subscript(PyUnicodeObject* self, PyObject* item)
7913{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007914 if (PyIndex_Check(item)) {
7915 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007916 if (i == -1 && PyErr_Occurred())
7917 return NULL;
7918 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007919 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007920 return unicode_getitem(self, i);
7921 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007922 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007923 Py_UNICODE* source_buf;
7924 Py_UNICODE* result_buf;
7925 PyObject* result;
7926
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007927 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007928 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007929 return NULL;
7930 }
7931
7932 if (slicelength <= 0) {
7933 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007934 } else if (start == 0 && step == 1 && slicelength == self->length &&
7935 PyUnicode_CheckExact(self)) {
7936 Py_INCREF(self);
7937 return (PyObject *)self;
7938 } else if (step == 1) {
7939 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007940 } else {
7941 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007942 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7943 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007944
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007945 if (result_buf == NULL)
7946 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007947
7948 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7949 result_buf[i] = source_buf[cur];
7950 }
Tim Petersced69f82003-09-16 20:30:58 +00007951
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007952 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007953 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007954 return result;
7955 }
7956 } else {
7957 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7958 return NULL;
7959 }
7960}
7961
7962static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007963 (lenfunc)unicode_length, /* mp_length */
7964 (binaryfunc)unicode_subscript, /* mp_subscript */
7965 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007966};
7967
Martin v. Löwis18e16552006-02-15 17:27:45 +00007968static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007970 Py_ssize_t index,
7971 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972{
7973 if (index != 0) {
7974 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007975 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 return -1;
7977 }
7978 *ptr = (void *) self->str;
7979 return PyUnicode_GET_DATA_SIZE(self);
7980}
7981
Martin v. Löwis18e16552006-02-15 17:27:45 +00007982static Py_ssize_t
7983unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007984 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985{
7986 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007987 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 return -1;
7989}
7990
7991static int
7992unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007993 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994{
7995 if (lenp)
7996 *lenp = PyUnicode_GET_DATA_SIZE(self);
7997 return 1;
7998}
7999
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008000static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008002 Py_ssize_t index,
8003 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004{
8005 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008006
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 if (index != 0) {
8008 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008009 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 return -1;
8011 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008012 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008014 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008015 *ptr = (void *) PyString_AS_STRING(str);
8016 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017}
8018
8019/* Helpers for PyUnicode_Format() */
8020
8021static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008022getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008024 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008026 (*p_argidx)++;
8027 if (arglen < 0)
8028 return args;
8029 else
8030 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 }
8032 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008033 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034 return NULL;
8035}
8036
/* Conversion flag bits collected by PyUnicode_Format() while parsing a
   format specifier; each corresponds to one flag character. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042
Martin v. Löwis18e16552006-02-15 17:27:45 +00008043static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008044strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008046 register Py_ssize_t i;
8047 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008049 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 return len;
8052}
8053
Neal Norwitzfc76d632006-01-10 06:03:13 +00008054static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008055longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8056{
Tim Peters15231542006-02-16 01:08:01 +00008057 Py_ssize_t result;
8058
Neal Norwitzfc76d632006-01-10 06:03:13 +00008059 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008060 result = strtounicode(buffer, (char *)buffer);
8061 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008062}
8063
Guido van Rossum078151d2002-08-11 04:24:12 +00008064/* XXX To save some code duplication, formatfloat/long/int could have been
8065 shared with stringobject.c, converting from 8-bit to Unicode after the
8066 formatting is done. */
8067
Mark Dickinson18cfada2009-11-23 18:46:41 +00008068/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8069
8070static PyObject *
8071formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008073 char *p;
8074 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008076
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077 x = PyFloat_AsDouble(v);
8078 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008079 return NULL;
8080
Guido van Rossumd57fd912000-03-10 22:53:23 +00008081 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008082 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008083
Mark Dickinson18cfada2009-11-23 18:46:41 +00008084 p = PyOS_double_to_string(x, type, prec,
8085 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8086 if (p == NULL)
8087 return NULL;
8088 result = PyUnicode_FromStringAndSize(p, strlen(p));
8089 PyMem_Free(p);
8090 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008091}
8092
Tim Peters38fd5b62000-09-21 05:43:11 +00008093static PyObject*
8094formatlong(PyObject *val, int flags, int prec, int type)
8095{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008096 char *buf;
8097 int i, len;
8098 PyObject *str; /* temporary string object. */
8099 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008100
Benjamin Peterson857ce152009-01-31 16:29:18 +00008101 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8102 if (!str)
8103 return NULL;
8104 result = _PyUnicode_New(len);
8105 if (!result) {
8106 Py_DECREF(str);
8107 return NULL;
8108 }
8109 for (i = 0; i < len; i++)
8110 result->str[i] = buf[i];
8111 result->str[len] = 0;
8112 Py_DECREF(str);
8113 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008114}
8115
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116static int
8117formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008118 size_t buflen,
8119 int flags,
8120 int prec,
8121 int type,
8122 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008124 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008125 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8126 * + 1 + 1
8127 * = 24
8128 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008129 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008130 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 long x;
8132
8133 x = PyInt_AsLong(v);
8134 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008135 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008136 if (x < 0 && type == 'u') {
8137 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008138 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008139 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8140 sign = "-";
8141 else
8142 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008144 prec = 1;
8145
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008146 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8147 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008148 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008149 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008150 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008151 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008152 return -1;
8153 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008154
8155 if ((flags & F_ALT) &&
8156 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008157 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008158 * of issues that cause pain:
8159 * - when 0 is being converted, the C standard leaves off
8160 * the '0x' or '0X', which is inconsistent with other
8161 * %#x/%#X conversions and inconsistent with Python's
8162 * hex() function
8163 * - there are platforms that violate the standard and
8164 * convert 0 with the '0x' or '0X'
8165 * (Metrowerks, Compaq Tru64)
8166 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008167 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008168 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008169 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008170 * We can achieve the desired consistency by inserting our
8171 * own '0x' or '0X' prefix, and substituting %x/%X in place
8172 * of %#x/%#X.
8173 *
8174 * Note that this is the same approach as used in
8175 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008176 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008177 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8178 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008179 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008180 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008181 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8182 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008183 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008184 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008185 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008186 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008187 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008188 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189}
8190
8191static int
8192formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008193 size_t buflen,
8194 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195{
Ezio Melotti32125152010-02-25 17:36:04 +00008196 PyObject *unistr;
8197 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008198 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008199 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008200 if (PyUnicode_GET_SIZE(v) != 1)
8201 goto onError;
8202 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008205 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008206 if (PyString_GET_SIZE(v) != 1)
8207 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008208 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8209 with a UnicodeDecodeError if 'char' is not decodable with the
8210 default encoding (usually ASCII, but it might be something else) */
8211 str = PyString_AS_STRING(v);
8212 if ((unsigned char)str[0] > 0x7F) {
8213 /* the char is not ASCII; try to decode the string using the
8214 default encoding and return -1 to let the UnicodeDecodeError
8215 be raised if the string can't be decoded */
8216 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8217 if (unistr == NULL)
8218 return -1;
8219 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8220 Py_DECREF(unistr);
8221 }
8222 else
8223 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008224 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008225
8226 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008227 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008229 x = PyInt_AsLong(v);
8230 if (x == -1 && PyErr_Occurred())
8231 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008232#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008233 if (x < 0 || x > 0x10ffff) {
8234 PyErr_SetString(PyExc_OverflowError,
8235 "%c arg not in range(0x110000) "
8236 "(wide Python build)");
8237 return -1;
8238 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008239#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008240 if (x < 0 || x > 0xffff) {
8241 PyErr_SetString(PyExc_OverflowError,
8242 "%c arg not in range(0x10000) "
8243 "(narrow Python build)");
8244 return -1;
8245 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008246#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008247 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248 }
8249 buf[1] = '\0';
8250 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008251
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008252 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008253 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008254 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008255 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256}
8257
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008258/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8259
Mark Dickinson18cfada2009-11-23 18:46:41 +00008260 FORMATBUFLEN is the length of the buffer in which the ints &
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008261 chars are formatted. XXX This is a magic number. Each formatting
8262 routine does bounds checking to ensure no overflow, but a better
8263 solution may be to malloc a buffer of appropriate size for each
8264 format. For now, the current solution is sufficient.
8265*/
/* Parenthesized so the cast expression expands as a single operand in
   any context where the macro is used. */
#define FORMATBUFLEN ((size_t)120)
8267
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008269 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270{
8271 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008272 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273 int args_owned = 0;
8274 PyUnicodeObject *result = NULL;
8275 PyObject *dict = NULL;
8276 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008277
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008279 PyErr_BadInternalCall();
8280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 }
8282 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008283 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 fmt = PyUnicode_AS_UNICODE(uformat);
8286 fmtcnt = PyUnicode_GET_SIZE(uformat);
8287
8288 reslen = rescnt = fmtcnt + 100;
8289 result = _PyUnicode_New(reslen);
8290 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008291 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 res = PyUnicode_AS_UNICODE(result);
8293
8294 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008295 arglen = PyTuple_Size(args);
8296 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 }
8298 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008299 arglen = -1;
8300 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 }
Benjamin Peterson23d49d32012-08-28 17:55:35 -04008302 if (PyMapping_Check(args) && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008303 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008304 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305
8306 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008307 if (*fmt != '%') {
8308 if (--rescnt < 0) {
8309 rescnt = fmtcnt + 100;
8310 reslen += rescnt;
8311 if (_PyUnicode_Resize(&result, reslen) < 0)
8312 goto onError;
8313 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8314 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008315 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008316 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008317 }
8318 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008319 /* Got a format specifier */
8320 int flags = 0;
8321 Py_ssize_t width = -1;
8322 int prec = -1;
8323 Py_UNICODE c = '\0';
8324 Py_UNICODE fill;
8325 int isnumok;
8326 PyObject *v = NULL;
8327 PyObject *temp = NULL;
8328 Py_UNICODE *pbuf;
8329 Py_UNICODE sign;
8330 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008331 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008332
8333 fmt++;
8334 if (*fmt == '(') {
8335 Py_UNICODE *keystart;
8336 Py_ssize_t keylen;
8337 PyObject *key;
8338 int pcount = 1;
8339
8340 if (dict == NULL) {
8341 PyErr_SetString(PyExc_TypeError,
8342 "format requires a mapping");
8343 goto onError;
8344 }
8345 ++fmt;
8346 --fmtcnt;
8347 keystart = fmt;
8348 /* Skip over balanced parentheses */
8349 while (pcount > 0 && --fmtcnt >= 0) {
8350 if (*fmt == ')')
8351 --pcount;
8352 else if (*fmt == '(')
8353 ++pcount;
8354 fmt++;
8355 }
8356 keylen = fmt - keystart - 1;
8357 if (fmtcnt < 0 || pcount > 0) {
8358 PyErr_SetString(PyExc_ValueError,
8359 "incomplete format key");
8360 goto onError;
8361 }
8362#if 0
8363 /* keys are converted to strings using UTF-8 and
8364 then looked up since Python uses strings to hold
8365 variables names etc. in its namespaces and we
8366 wouldn't want to break common idioms. */
8367 key = PyUnicode_EncodeUTF8(keystart,
8368 keylen,
8369 NULL);
8370#else
8371 key = PyUnicode_FromUnicode(keystart, keylen);
8372#endif
8373 if (key == NULL)
8374 goto onError;
8375 if (args_owned) {
8376 Py_DECREF(args);
8377 args_owned = 0;
8378 }
8379 args = PyObject_GetItem(dict, key);
8380 Py_DECREF(key);
8381 if (args == NULL) {
8382 goto onError;
8383 }
8384 args_owned = 1;
8385 arglen = -1;
8386 argidx = -2;
8387 }
8388 while (--fmtcnt >= 0) {
8389 switch (c = *fmt++) {
8390 case '-': flags |= F_LJUST; continue;
8391 case '+': flags |= F_SIGN; continue;
8392 case ' ': flags |= F_BLANK; continue;
8393 case '#': flags |= F_ALT; continue;
8394 case '0': flags |= F_ZERO; continue;
8395 }
8396 break;
8397 }
8398 if (c == '*') {
8399 v = getnextarg(args, arglen, &argidx);
8400 if (v == NULL)
8401 goto onError;
8402 if (!PyInt_Check(v)) {
8403 PyErr_SetString(PyExc_TypeError,
8404 "* wants int");
8405 goto onError;
8406 }
8407 width = PyInt_AsLong(v);
8408 if (width < 0) {
8409 flags |= F_LJUST;
8410 width = -width;
8411 }
8412 if (--fmtcnt >= 0)
8413 c = *fmt++;
8414 }
8415 else if (c >= '0' && c <= '9') {
8416 width = c - '0';
8417 while (--fmtcnt >= 0) {
8418 c = *fmt++;
8419 if (c < '0' || c > '9')
8420 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008421 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008422 PyErr_SetString(PyExc_ValueError,
8423 "width too big");
8424 goto onError;
8425 }
8426 width = width*10 + (c - '0');
8427 }
8428 }
8429 if (c == '.') {
8430 prec = 0;
8431 if (--fmtcnt >= 0)
8432 c = *fmt++;
8433 if (c == '*') {
8434 v = getnextarg(args, arglen, &argidx);
8435 if (v == NULL)
8436 goto onError;
8437 if (!PyInt_Check(v)) {
8438 PyErr_SetString(PyExc_TypeError,
8439 "* wants int");
8440 goto onError;
8441 }
8442 prec = PyInt_AsLong(v);
8443 if (prec < 0)
8444 prec = 0;
8445 if (--fmtcnt >= 0)
8446 c = *fmt++;
8447 }
8448 else if (c >= '0' && c <= '9') {
8449 prec = c - '0';
8450 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008451 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008452 if (c < '0' || c > '9')
8453 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008454 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008455 PyErr_SetString(PyExc_ValueError,
8456 "prec too big");
8457 goto onError;
8458 }
8459 prec = prec*10 + (c - '0');
8460 }
8461 }
8462 } /* prec */
8463 if (fmtcnt >= 0) {
8464 if (c == 'h' || c == 'l' || c == 'L') {
8465 if (--fmtcnt >= 0)
8466 c = *fmt++;
8467 }
8468 }
8469 if (fmtcnt < 0) {
8470 PyErr_SetString(PyExc_ValueError,
8471 "incomplete format");
8472 goto onError;
8473 }
8474 if (c != '%') {
8475 v = getnextarg(args, arglen, &argidx);
8476 if (v == NULL)
8477 goto onError;
8478 }
8479 sign = 0;
8480 fill = ' ';
8481 switch (c) {
8482
8483 case '%':
8484 pbuf = formatbuf;
8485 /* presume that buffer length is at least 1 */
8486 pbuf[0] = '%';
8487 len = 1;
8488 break;
8489
8490 case 's':
8491 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008492 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008493 temp = v;
8494 Py_INCREF(temp);
8495 }
8496 else {
8497 PyObject *unicode;
8498 if (c == 's')
8499 temp = PyObject_Unicode(v);
8500 else
8501 temp = PyObject_Repr(v);
8502 if (temp == NULL)
8503 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008504 if (PyUnicode_Check(temp))
8505 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008506 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008507 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008508 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8509 PyString_GET_SIZE(temp),
8510 NULL,
8511 "strict");
8512 Py_DECREF(temp);
8513 temp = unicode;
8514 if (temp == NULL)
8515 goto onError;
8516 }
8517 else {
8518 Py_DECREF(temp);
8519 PyErr_SetString(PyExc_TypeError,
8520 "%s argument has non-string str()");
8521 goto onError;
8522 }
8523 }
8524 pbuf = PyUnicode_AS_UNICODE(temp);
8525 len = PyUnicode_GET_SIZE(temp);
8526 if (prec >= 0 && len > prec)
8527 len = prec;
8528 break;
8529
8530 case 'i':
8531 case 'd':
8532 case 'u':
8533 case 'o':
8534 case 'x':
8535 case 'X':
8536 if (c == 'i')
8537 c = 'd';
8538 isnumok = 0;
8539 if (PyNumber_Check(v)) {
8540 PyObject *iobj=NULL;
8541
8542 if (PyInt_Check(v) || (PyLong_Check(v))) {
8543 iobj = v;
8544 Py_INCREF(iobj);
8545 }
8546 else {
8547 iobj = PyNumber_Int(v);
8548 if (iobj==NULL) iobj = PyNumber_Long(v);
8549 }
8550 if (iobj!=NULL) {
8551 if (PyInt_Check(iobj)) {
8552 isnumok = 1;
8553 pbuf = formatbuf;
8554 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8555 flags, prec, c, iobj);
8556 Py_DECREF(iobj);
8557 if (len < 0)
8558 goto onError;
8559 sign = 1;
8560 }
8561 else if (PyLong_Check(iobj)) {
8562 isnumok = 1;
8563 temp = formatlong(iobj, flags, prec, c);
8564 Py_DECREF(iobj);
8565 if (!temp)
8566 goto onError;
8567 pbuf = PyUnicode_AS_UNICODE(temp);
8568 len = PyUnicode_GET_SIZE(temp);
8569 sign = 1;
8570 }
8571 else {
8572 Py_DECREF(iobj);
8573 }
8574 }
8575 }
8576 if (!isnumok) {
8577 PyErr_Format(PyExc_TypeError,
8578 "%%%c format: a number is required, "
8579 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8580 goto onError;
8581 }
8582 if (flags & F_ZERO)
8583 fill = '0';
8584 break;
8585
8586 case 'e':
8587 case 'E':
8588 case 'f':
8589 case 'F':
8590 case 'g':
8591 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008592 temp = formatfloat(v, flags, prec, c);
8593 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008594 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008595 pbuf = PyUnicode_AS_UNICODE(temp);
8596 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008597 sign = 1;
8598 if (flags & F_ZERO)
8599 fill = '0';
8600 break;
8601
8602 case 'c':
8603 pbuf = formatbuf;
8604 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8605 if (len < 0)
8606 goto onError;
8607 break;
8608
8609 default:
8610 PyErr_Format(PyExc_ValueError,
8611 "unsupported format character '%c' (0x%x) "
8612 "at index %zd",
8613 (31<=c && c<=126) ? (char)c : '?',
8614 (int)c,
8615 (Py_ssize_t)(fmt - 1 -
8616 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008617 goto onError;
8618 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008619 if (sign) {
8620 if (*pbuf == '-' || *pbuf == '+') {
8621 sign = *pbuf++;
8622 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008623 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008624 else if (flags & F_SIGN)
8625 sign = '+';
8626 else if (flags & F_BLANK)
8627 sign = ' ';
8628 else
8629 sign = 0;
8630 }
8631 if (width < len)
8632 width = len;
8633 if (rescnt - (sign != 0) < width) {
8634 reslen -= rescnt;
8635 rescnt = width + fmtcnt + 100;
8636 reslen += rescnt;
8637 if (reslen < 0) {
8638 Py_XDECREF(temp);
8639 PyErr_NoMemory();
8640 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008641 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008642 if (_PyUnicode_Resize(&result, reslen) < 0) {
8643 Py_XDECREF(temp);
8644 goto onError;
8645 }
8646 res = PyUnicode_AS_UNICODE(result)
8647 + reslen - rescnt;
8648 }
8649 if (sign) {
8650 if (fill != ' ')
8651 *res++ = sign;
8652 rescnt--;
8653 if (width > len)
8654 width--;
8655 }
8656 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8657 assert(pbuf[0] == '0');
8658 assert(pbuf[1] == c);
8659 if (fill != ' ') {
8660 *res++ = *pbuf++;
8661 *res++ = *pbuf++;
8662 }
8663 rescnt -= 2;
8664 width -= 2;
8665 if (width < 0)
8666 width = 0;
8667 len -= 2;
8668 }
8669 if (width > len && !(flags & F_LJUST)) {
8670 do {
8671 --rescnt;
8672 *res++ = fill;
8673 } while (--width > len);
8674 }
8675 if (fill == ' ') {
8676 if (sign)
8677 *res++ = sign;
8678 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8679 assert(pbuf[0] == '0');
8680 assert(pbuf[1] == c);
8681 *res++ = *pbuf++;
8682 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008683 }
8684 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008685 Py_UNICODE_COPY(res, pbuf, len);
8686 res += len;
8687 rescnt -= len;
8688 while (--width >= len) {
8689 --rescnt;
8690 *res++ = ' ';
8691 }
8692 if (dict && (argidx < arglen) && c != '%') {
8693 PyErr_SetString(PyExc_TypeError,
8694 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008695 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008696 goto onError;
8697 }
8698 Py_XDECREF(temp);
8699 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 } /* until end */
8701 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008702 PyErr_SetString(PyExc_TypeError,
8703 "not all arguments converted during string formatting");
8704 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 }
8706
Thomas Woutersa96affe2006-03-12 00:29:36 +00008707 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008708 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008710 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 }
8712 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 return (PyObject *)result;
8714
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008715 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 Py_XDECREF(result);
8717 Py_DECREF(uformat);
8718 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008719 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 }
8721 return NULL;
8722}
8723
8724static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008725 (readbufferproc) unicode_buffer_getreadbuf,
8726 (writebufferproc) unicode_buffer_getwritebuf,
8727 (segcountproc) unicode_buffer_getsegcount,
8728 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729};
8730
Jeremy Hylton938ace62002-07-17 16:30:39 +00008731static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008732unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8733
Tim Peters6d6c1a32001-08-02 04:15:00 +00008734static PyObject *
8735unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8736{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008737 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008738 static char *kwlist[] = {"string", "encoding", "errors", 0};
8739 char *encoding = NULL;
8740 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008741
Benjamin Peterson857ce152009-01-31 16:29:18 +00008742 if (type != &PyUnicode_Type)
8743 return unicode_subtype_new(type, args, kwds);
8744 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008745 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008746 return NULL;
8747 if (x == NULL)
8748 return (PyObject *)_PyUnicode_New(0);
8749 if (encoding == NULL && errors == NULL)
8750 return PyObject_Unicode(x);
8751 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008752 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008753}
8754
Guido van Rossume023fe02001-08-30 03:12:59 +00008755static PyObject *
8756unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8757{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008758 PyUnicodeObject *tmp, *pnew;
8759 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008760
Benjamin Peterson857ce152009-01-31 16:29:18 +00008761 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8762 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8763 if (tmp == NULL)
8764 return NULL;
8765 assert(PyUnicode_Check(tmp));
8766 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8767 if (pnew == NULL) {
8768 Py_DECREF(tmp);
8769 return NULL;
8770 }
8771 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8772 if (pnew->str == NULL) {
8773 _Py_ForgetReference((PyObject *)pnew);
8774 PyObject_Del(pnew);
8775 Py_DECREF(tmp);
8776 return PyErr_NoMemory();
8777 }
8778 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8779 pnew->length = n;
8780 pnew->hash = tmp->hash;
8781 Py_DECREF(tmp);
8782 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008783}
8784
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008785PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008786 "unicode(object='') -> unicode object\n\
8787unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008788\n\
8789Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008790encoding defaults to the current default string encoding.\n\
8791errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008792
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008794 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008795 "unicode", /* tp_name */
8796 sizeof(PyUnicodeObject), /* tp_size */
8797 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008799 (destructor)unicode_dealloc, /* tp_dealloc */
8800 0, /* tp_print */
8801 0, /* tp_getattr */
8802 0, /* tp_setattr */
8803 0, /* tp_compare */
8804 unicode_repr, /* tp_repr */
8805 &unicode_as_number, /* tp_as_number */
8806 &unicode_as_sequence, /* tp_as_sequence */
8807 &unicode_as_mapping, /* tp_as_mapping */
8808 (hashfunc) unicode_hash, /* tp_hash*/
8809 0, /* tp_call*/
8810 (reprfunc) unicode_str, /* tp_str */
8811 PyObject_GenericGetAttr, /* tp_getattro */
8812 0, /* tp_setattro */
8813 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008814 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008815 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008816 unicode_doc, /* tp_doc */
8817 0, /* tp_traverse */
8818 0, /* tp_clear */
8819 PyUnicode_RichCompare, /* tp_richcompare */
8820 0, /* tp_weaklistoffset */
8821 0, /* tp_iter */
8822 0, /* tp_iternext */
8823 unicode_methods, /* tp_methods */
8824 0, /* tp_members */
8825 0, /* tp_getset */
8826 &PyBaseString_Type, /* tp_base */
8827 0, /* tp_dict */
8828 0, /* tp_descr_get */
8829 0, /* tp_descr_set */
8830 0, /* tp_dictoffset */
8831 0, /* tp_init */
8832 0, /* tp_alloc */
8833 unicode_new, /* tp_new */
8834 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835};
8836
8837/* Initialize the Unicode implementation */
8838
Thomas Wouters78890102000-07-22 19:25:51 +00008839void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008841 int i;
8842
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008843 /* XXX - move this array to unicodectype.c ? */
8844 Py_UNICODE linebreak[] = {
8845 0x000A, /* LINE FEED */
8846 0x000D, /* CARRIAGE RETURN */
8847 0x001C, /* FILE SEPARATOR */
8848 0x001D, /* GROUP SEPARATOR */
8849 0x001E, /* RECORD SEPARATOR */
8850 0x0085, /* NEXT LINE */
8851 0x2028, /* LINE SEPARATOR */
8852 0x2029, /* PARAGRAPH SEPARATOR */
8853 };
8854
Fred Drakee4315f52000-05-09 19:53:39 +00008855 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008856 free_list = NULL;
8857 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008859 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008860 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008861
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008862 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008863 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008864 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008865 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008866 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008867
8868 /* initialize the linebreak bloom filter */
8869 bloom_linebreak = make_bloom_mask(
8870 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8871 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008872
8873 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008874
8875 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8876 Py_FatalError("Can't initialize field name iterator type");
8877
8878 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8879 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880}
8881
8882/* Finalize the Unicode implementation */
8883
Christian Heimes3b718a72008-02-14 12:47:33 +00008884int
8885PyUnicode_ClearFreeList(void)
8886{
8887 int freelist_size = numfree;
8888 PyUnicodeObject *u;
8889
8890 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008891 PyUnicodeObject *v = u;
8892 u = *(PyUnicodeObject **)u;
8893 if (v->str)
8894 PyObject_DEL(v->str);
8895 Py_XDECREF(v->defenc);
8896 PyObject_Del(v);
8897 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008898 }
8899 free_list = NULL;
8900 assert(numfree == 0);
8901 return freelist_size;
8902}
8903
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904void
Thomas Wouters78890102000-07-22 19:25:51 +00008905_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008907 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008908
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008909 Py_XDECREF(unicode_empty);
8910 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008911
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008912 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008913 if (unicode_latin1[i]) {
8914 Py_DECREF(unicode_latin1[i]);
8915 unicode_latin1[i] = NULL;
8916 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008917 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008918 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008920
Anthony Baxterac6bd462006-04-13 02:06:09 +00008921#ifdef __cplusplus
8922}
8923#endif