blob: c1b38cc4fe04da9eb5a2e2c4bcc312fec9b3927c [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson1c5d21d2009-01-31 22:33:02 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
Christian Heimes5b970ad2008-02-06 13:33:44 +000054#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Christian Heimes5b970ad2008-02-06 13:33:44 +000062 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
Christian Heimes4d4f2702008-01-30 11:32:37 +0000115/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by ASCII code point (0x00..0x7F); an entry is 1
   when the code point is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
146/* Same for linebreaks */
/* Lookup table indexed by ASCII code point (0x00..0x7F); an entry is 1
   when the code point is treated as a line break and 0 otherwise.
   The table is read-only, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000180#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the low
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
Antoine Pitrou10042922010-01-13 14:01:26 +0000195#if LONG_BIT >= 128
196#define BLOOM_WIDTH 128
197#elif LONG_BIT >= 64
198#define BLOOM_WIDTH 64
199#elif LONG_BIT >= 32
200#define BLOOM_WIDTH 32
201#else
202#error "LONG_BIT is smaller than 32"
203#endif
204
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000205#define BLOOM_MASK unsigned long
206
207static BLOOM_MASK bloom_linebreak;
208
Antoine Pitrou10042922010-01-13 14:01:26 +0000209#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000212#define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
Antoine Pitrou10042922010-01-13 14:01:26 +0000220 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000225 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226
227 return mask;
228}
229
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
Fredrik Lundh77633512006-05-23 19:47:35 +0000238 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239}
240
/* Membership test: cheap bloom-mask check first, exact scan of `set`
   only when the mask says the character may be present.  The expansion
   is fully parenthesized so it can be negated or combined with other
   operators.  Note that `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259
Benjamin Peterson857ce152009-01-31 16:29:18 +0000260 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000271 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000288 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 }
290 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 return 0;
293}
294
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated; some code relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Andrew Dalkee0df7622006-05-27 11:04:36 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitze7d8be82008-07-31 17:17:14 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000324 if (unicode->str) {
325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000329 PyObject_DEL(unicode->str);
330 unicode->str = NULL;
331 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000349 PyErr_NoMemory();
350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000363 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000365
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000366 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000367 /* XXX UNREF/NEWREF interface should be more symmetrical */
368 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000369 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000370 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372}
373
374static
Guido van Rossum9475a232001-10-05 20:51:39 +0000375void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000377 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000378 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000379 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000380 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
381 PyObject_DEL(unicode->str);
382 unicode->str = NULL;
383 unicode->length = 0;
384 }
385 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000386 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000387 }
388 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000389 *(PyUnicodeObject **)unicode = free_list;
390 free_list = unicode;
391 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 }
393 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000394 PyObject_DEL(unicode->str);
395 Py_XDECREF(unicode->defenc);
396 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 }
398}
399
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000400static
401int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000402{
403 register PyUnicodeObject *v;
404
405 /* Argument checks */
406 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyErr_BadInternalCall();
408 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000409 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000410 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000411 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000412 PyErr_BadInternalCall();
413 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000414 }
415
416 /* Resizing unicode_empty and single character objects is not
417 possible since these are being shared. We simply return a fresh
418 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000419 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 (v == unicode_empty || v->length == 1)) {
421 PyUnicodeObject *w = _PyUnicode_New(length);
422 if (w == NULL)
423 return -1;
424 Py_UNICODE_COPY(w->str, v->str,
425 length < v->length ? length : v->length);
426 Py_DECREF(*unicode);
427 *unicode = w;
428 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 }
430
431 /* Note that we don't have to modify *unicode for unshared Unicode
432 objects, since we can modify them in-place. */
433 return unicode_resize(v, length);
434}
435
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000436int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
437{
438 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
439}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000442 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443{
444 PyUnicodeObject *unicode;
445
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446 /* If the Unicode data is known at construction time, we can apply
447 some optimizations which share commonly used objects. */
448 if (u != NULL) {
449
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000450 /* Optimization for empty strings */
451 if (size == 0 && unicode_empty != NULL) {
452 Py_INCREF(unicode_empty);
453 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000454 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455
456 /* Single character Unicode objects in the Latin-1 range are
457 shared when using this constructor */
458 if (size == 1 && *u < 256) {
459 unicode = unicode_latin1[*u];
460 if (!unicode) {
461 unicode = _PyUnicode_New(1);
462 if (!unicode)
463 return NULL;
464 unicode->str[0] = *u;
465 unicode_latin1[*u] = unicode;
466 }
467 Py_INCREF(unicode);
468 return (PyObject *)unicode;
469 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000470 }
Tim Petersced69f82003-09-16 20:30:58 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 unicode = _PyUnicode_New(size);
473 if (!unicode)
474 return NULL;
475
476 /* Copy the Unicode data into the new object */
477 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000478 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479
480 return (PyObject *)unicode;
481}
482
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000483PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
484{
485 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000486
Benjamin Peterson857ce152009-01-31 16:29:18 +0000487 if (size < 0) {
488 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000490 return NULL;
491 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000492
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000493 /* If the Unicode data is known at construction time, we can apply
494 some optimizations which share commonly used objects.
495 Also, this means the input must be UTF-8, so fall back to the
496 UTF-8 decoder at the end. */
497 if (u != NULL) {
498
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000499 /* Optimization for empty strings */
500 if (size == 0 && unicode_empty != NULL) {
501 Py_INCREF(unicode_empty);
502 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000503 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000504
505 /* Single characters are shared when using this constructor.
506 Restrict to ASCII, since the input must be UTF-8. */
507 if (size == 1 && Py_CHARMASK(*u) < 128) {
508 unicode = unicode_latin1[Py_CHARMASK(*u)];
509 if (!unicode) {
510 unicode = _PyUnicode_New(1);
511 if (!unicode)
512 return NULL;
513 unicode->str[0] = Py_CHARMASK(*u);
514 unicode_latin1[Py_CHARMASK(*u)] = unicode;
515 }
516 Py_INCREF(unicode);
517 return (PyObject *)unicode;
518 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000519
520 return PyUnicode_DecodeUTF8(u, size, NULL);
521 }
522
523 unicode = _PyUnicode_New(size);
524 if (!unicode)
525 return NULL;
526
527 return (PyObject *)unicode;
528}
529
530PyObject *PyUnicode_FromString(const char *u)
531{
532 size_t size = strlen(u);
533 if (size > PY_SSIZE_T_MAX) {
534 PyErr_SetString(PyExc_OverflowError, "input too long");
535 return NULL;
536 }
537
538 return PyUnicode_FromStringAndSize(u, size);
539}
540
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541#ifdef HAVE_WCHAR_H
542
Mark Dickinson6b265f12009-03-18 16:07:26 +0000543#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544# define CONVERT_WCHAR_TO_SURROGATES
545#endif
546
547#ifdef CONVERT_WCHAR_TO_SURROGATES
548
549/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550 to convert from UTF32 to UTF16. */
551
552PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
553 Py_ssize_t size)
554{
555 PyUnicodeObject *unicode;
556 register Py_ssize_t i;
557 Py_ssize_t alloc;
558 const wchar_t *orig_w;
559
560 if (w == NULL) {
561 PyErr_BadInternalCall();
562 return NULL;
563 }
564
565 alloc = size;
566 orig_w = w;
567 for (i = size; i > 0; i--) {
568 if (*w > 0xFFFF)
569 alloc++;
570 w++;
571 }
572 w = orig_w;
573 unicode = _PyUnicode_New(alloc);
574 if (!unicode)
575 return NULL;
576
577 /* Copy the wchar_t data into the new object */
578 {
579 register Py_UNICODE *u;
580 u = PyUnicode_AS_UNICODE(unicode);
581 for (i = size; i > 0; i--) {
582 if (*w > 0xFFFF) {
583 wchar_t ordinal = *w++;
584 ordinal -= 0x10000;
585 *u++ = 0xD800 | (ordinal >> 10);
586 *u++ = 0xDC00 | (ordinal & 0x3FF);
587 }
588 else
589 *u++ = *w++;
590 }
591 }
592 return (PyObject *)unicode;
593}
594
595#else
596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000598 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599{
600 PyUnicodeObject *unicode;
601
602 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000603 PyErr_BadInternalCall();
604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 }
606
607 unicode = _PyUnicode_New(size);
608 if (!unicode)
609 return NULL;
610
611 /* Copy the wchar_t data into the new object */
612#ifdef HAVE_USABLE_WCHAR_T
613 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000614#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000615 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000616 register Py_UNICODE *u;
617 register Py_ssize_t i;
618 u = PyUnicode_AS_UNICODE(unicode);
619 for (i = size; i > 0; i--)
620 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 }
622#endif
623
624 return (PyObject *)unicode;
625}
626
Mark Dickinson6b265f12009-03-18 16:07:26 +0000627#endif /* CONVERT_WCHAR_TO_SURROGATES */
628
629#undef CONVERT_WCHAR_TO_SURROGATES
630
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000631static void
632makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
633{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000634 *fmt++ = '%';
635 if (width) {
636 if (zeropad)
637 *fmt++ = '0';
638 fmt += sprintf(fmt, "%d", width);
639 }
640 if (precision)
641 fmt += sprintf(fmt, ".%d", precision);
642 if (longflag)
643 *fmt++ = 'l';
644 else if (size_tflag) {
645 char *f = PY_FORMAT_SIZE_T;
646 while (*f)
647 *fmt++ = *f++;
648 }
649 *fmt++ = c;
650 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000651}
652
/* Copy the NUL-terminated char string `string` to the output cursor,
   one element per character, advancing the cursor.  Relies on two
   variables in the caller's scope: `copy` (scratch const char *) and
   `s` (output cursor).  The terminator itself is not written. */
#define appendstring(string) {for (copy = (string);*copy;) *s++ = *copy++;}
654
655PyObject *
656PyUnicode_FromFormatV(const char *format, va_list vargs)
657{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000658 va_list count;
659 Py_ssize_t callcount = 0;
660 PyObject **callresults = NULL;
661 PyObject **callresult = NULL;
662 Py_ssize_t n = 0;
663 int width = 0;
664 int precision = 0;
665 int zeropad;
666 const char* f;
667 Py_UNICODE *s;
668 PyObject *string;
669 /* used by sprintf */
670 char buffer[21];
671 /* use abuffer instead of buffer, if we need more space
672 * (which can happen if there's a format specifier with width). */
673 char *abuffer = NULL;
674 char *realbuffer;
675 Py_ssize_t abuffersize = 0;
676 char fmt[60]; /* should be enough for %0width.precisionld */
677 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000678
679#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000680 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000681#else
682#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000683 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000684#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#endif
687#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000688 /* step 1: count the number of %S/%R/%s format specifications
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000691 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000692 if (*f == '%') {
693 if (*(f+1)=='%')
694 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000695 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000696 ++callcount;
697 while (isdigit((unsigned)*f))
698 width = (width*10) + *f++ - '0';
699 while (*++f && *f != '%' && !isalpha((unsigned)*f))
700 ;
701 if (*f == 's')
702 ++callcount;
703 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000704 }
705 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000707 if (callcount) {
708 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
709 if (!callresults) {
710 PyErr_NoMemory();
711 return NULL;
712 }
713 callresult = callresults;
714 }
715 /* step 3: figure out how large a buffer we need */
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f;
719 width = 0;
720 while (isdigit((unsigned)*f))
721 width = (width*10) + *f++ - '0';
722 while (*++f && *f != '%' && !isalpha((unsigned)*f))
723 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000724
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726 * they don't affect the amount of space we reserve.
727 */
728 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000729 (f[1] == 'd' || f[1] == 'u'))
730 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731
Benjamin Peterson857ce152009-01-31 16:29:18 +0000732 switch (*f) {
733 case 'c':
734 (void)va_arg(count, int);
735 /* fall through... */
736 case '%':
737 n++;
738 break;
739 case 'd': case 'u': case 'i': case 'x':
740 (void) va_arg(count, int);
741 /* 20 bytes is enough to hold a 64-bit
742 integer. Decimal takes the most space.
743 This isn't enough for octal.
744 If a width is specified we need more
745 (which we allocate later). */
746 if (width < 20)
747 width = 20;
748 n += width;
749 if (abuffersize < width)
750 abuffersize = width;
751 break;
752 case 's':
753 {
754 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000755 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000756 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
757 if (!str)
758 goto fail;
759 n += PyUnicode_GET_SIZE(str);
760 /* Remember the str and switch to the next slot */
761 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000762 break;
763 }
764 case 'U':
765 {
766 PyObject *obj = va_arg(count, PyObject *);
767 assert(obj && PyUnicode_Check(obj));
768 n += PyUnicode_GET_SIZE(obj);
769 break;
770 }
771 case 'V':
772 {
773 PyObject *obj = va_arg(count, PyObject *);
774 const char *str = va_arg(count, const char *);
775 assert(obj || str);
776 assert(!obj || PyUnicode_Check(obj));
777 if (obj)
778 n += PyUnicode_GET_SIZE(obj);
779 else
780 n += strlen(str);
781 break;
782 }
783 case 'S':
784 {
785 PyObject *obj = va_arg(count, PyObject *);
786 PyObject *str;
787 assert(obj);
788 str = PyObject_Str(obj);
789 if (!str)
790 goto fail;
791 n += PyUnicode_GET_SIZE(str);
792 /* Remember the str and switch to the next slot */
793 *callresult++ = str;
794 break;
795 }
796 case 'R':
797 {
798 PyObject *obj = va_arg(count, PyObject *);
799 PyObject *repr;
800 assert(obj);
801 repr = PyObject_Repr(obj);
802 if (!repr)
803 goto fail;
804 n += PyUnicode_GET_SIZE(repr);
805 /* Remember the repr and switch to the next slot */
806 *callresult++ = repr;
807 break;
808 }
809 case 'p':
810 (void) va_arg(count, int);
811 /* maximum 64-bit pointer representation:
812 * 0xffffffffffffffff
813 * so 19 characters is enough.
814 * XXX I count 18 -- what's the extra for?
815 */
816 n += 19;
817 break;
818 default:
819 /* if we stumble upon an unknown
820 formatting code, copy the rest of
821 the format string to the output
822 string. (we cannot just skip the
823 code, since there's no way to know
824 what's in the argument list) */
825 n += strlen(p);
826 goto expand;
827 }
828 } else
829 n++;
830 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000831 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000832 if (abuffersize > 20) {
833 abuffer = PyObject_Malloc(abuffersize);
834 if (!abuffer) {
835 PyErr_NoMemory();
836 goto fail;
837 }
838 realbuffer = abuffer;
839 }
840 else
841 realbuffer = buffer;
842 /* step 4: fill the buffer */
843 /* Since we've analyzed how much space we need for the worst case,
844 we don't have to resize the string.
845 There can be no errors beyond this point. */
846 string = PyUnicode_FromUnicode(NULL, n);
847 if (!string)
848 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000849
Benjamin Peterson857ce152009-01-31 16:29:18 +0000850 s = PyUnicode_AS_UNICODE(string);
851 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000852
Benjamin Peterson857ce152009-01-31 16:29:18 +0000853 for (f = format; *f; f++) {
854 if (*f == '%') {
855 const char* p = f++;
856 int longflag = 0;
857 int size_tflag = 0;
858 zeropad = (*f == '0');
859 /* parse the width.precision part */
860 width = 0;
861 while (isdigit((unsigned)*f))
862 width = (width*10) + *f++ - '0';
863 precision = 0;
864 if (*f == '.') {
865 f++;
866 while (isdigit((unsigned)*f))
867 precision = (precision*10) + *f++ - '0';
868 }
869 /* handle the long flag, but only for %ld and %lu.
870 others can be added when necessary. */
871 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
872 longflag = 1;
873 ++f;
874 }
875 /* handle the size_t flag. */
876 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
877 size_tflag = 1;
878 ++f;
879 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000880
Benjamin Peterson857ce152009-01-31 16:29:18 +0000881 switch (*f) {
882 case 'c':
883 *s++ = va_arg(vargs, int);
884 break;
885 case 'd':
886 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
887 if (longflag)
888 sprintf(realbuffer, fmt, va_arg(vargs, long));
889 else if (size_tflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
891 else
892 sprintf(realbuffer, fmt, va_arg(vargs, int));
893 appendstring(realbuffer);
894 break;
895 case 'u':
896 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
897 if (longflag)
898 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
899 else if (size_tflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
901 else
902 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
903 appendstring(realbuffer);
904 break;
905 case 'i':
906 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
909 break;
910 case 'x':
911 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
912 sprintf(realbuffer, fmt, va_arg(vargs, int));
913 appendstring(realbuffer);
914 break;
915 case 's':
916 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000917 /* unused, since we already have the result */
918 (void) va_arg(vargs, char *);
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
920 PyUnicode_GET_SIZE(*callresult));
921 s += PyUnicode_GET_SIZE(*callresult);
922 /* We're done with the unicode()/repr() => forget it */
923 Py_DECREF(*callresult);
924 /* switch to next unicode()/repr() result */
925 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000926 break;
927 }
928 case 'U':
929 {
930 PyObject *obj = va_arg(vargs, PyObject *);
931 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933 s += size;
934 break;
935 }
936 case 'V':
937 {
938 PyObject *obj = va_arg(vargs, PyObject *);
939 const char *str = va_arg(vargs, const char *);
940 if (obj) {
941 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
942 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
943 s += size;
944 } else {
945 appendstring(str);
946 }
947 break;
948 }
949 case 'S':
950 case 'R':
951 {
952 Py_UNICODE *ucopy;
953 Py_ssize_t usize;
954 Py_ssize_t upos;
955 /* unused, since we already have the result */
956 (void) va_arg(vargs, PyObject *);
957 ucopy = PyUnicode_AS_UNICODE(*callresult);
958 usize = PyUnicode_GET_SIZE(*callresult);
959 for (upos = 0; upos<usize;)
960 *s++ = ucopy[upos++];
961 /* We're done with the unicode()/repr() => forget it */
962 Py_DECREF(*callresult);
963 /* switch to next unicode()/repr() result */
964 ++callresult;
965 break;
966 }
967 case 'p':
968 sprintf(buffer, "%p", va_arg(vargs, void*));
969 /* %p is ill-defined: ensure leading 0x. */
970 if (buffer[1] == 'X')
971 buffer[1] = 'x';
972 else if (buffer[1] != 'x') {
973 memmove(buffer+2, buffer, strlen(buffer)+1);
974 buffer[0] = '0';
975 buffer[1] = 'x';
976 }
977 appendstring(buffer);
978 break;
979 case '%':
980 *s++ = '%';
981 break;
982 default:
983 appendstring(p);
984 goto end;
985 }
986 } else
987 *s++ = *f;
988 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000989
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000990 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000991 if (callresults)
992 PyObject_Free(callresults);
993 if (abuffer)
994 PyObject_Free(abuffer);
995 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
996 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000997 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000998 if (callresults) {
999 PyObject **callresult2 = callresults;
1000 while (callresult2 < callresult) {
1001 Py_DECREF(*callresult2);
1002 ++callresult2;
1003 }
1004 PyObject_Free(callresults);
1005 }
1006 if (abuffer)
1007 PyObject_Free(abuffer);
1008 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001009}
1010
1011#undef appendstring
1012
1013PyObject *
1014PyUnicode_FromFormat(const char *format, ...)
1015{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001016 PyObject* ret;
1017 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001018
1019#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001020 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001021#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 ret = PyUnicode_FromFormatV(format, vargs);
1025 va_end(vargs);
1026 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001027}
1028
Martin v. Löwis18e16552006-02-15 17:27:45 +00001029Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001030 wchar_t *w,
1031 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032{
1033 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001034 PyErr_BadInternalCall();
1035 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001037
1038 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001040 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001041
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042#ifdef HAVE_USABLE_WCHAR_T
1043 memcpy(w, unicode->str, size * sizeof(wchar_t));
1044#else
1045 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001046 register Py_UNICODE *u;
1047 register Py_ssize_t i;
1048 u = PyUnicode_AS_UNICODE(unicode);
1049 for (i = size; i > 0; i--)
1050 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051 }
1052#endif
1053
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001054 if (size > PyUnicode_GET_SIZE(unicode))
1055 return PyUnicode_GET_SIZE(unicode);
1056 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001057 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058}
1059
1060#endif
1061
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062PyObject *PyUnicode_FromOrdinal(int ordinal)
1063{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001064 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001065
1066#ifdef Py_UNICODE_WIDE
1067 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001068 PyErr_SetString(PyExc_ValueError,
1069 "unichr() arg not in range(0x110000) "
1070 "(wide Python build)");
1071 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001072 }
1073#else
1074 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001075 PyErr_SetString(PyExc_ValueError,
1076 "unichr() arg not in range(0x10000) "
1077 "(narrow Python build)");
1078 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001079 }
1080#endif
1081
Hye-Shik Chang40574832004-04-06 07:24:51 +00001082 s[0] = (Py_UNICODE)ordinal;
1083 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001084}
1085
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086PyObject *PyUnicode_FromObject(register PyObject *obj)
1087{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001089 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 Py_INCREF(obj);
1092 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001093 }
1094 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001095 /* For a Unicode subtype that's not a Unicode object,
1096 return a true Unicode object with the same data. */
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101}
1102
1103PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001104 const char *encoding,
1105 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001106{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001107 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001108 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001112 PyErr_BadInternalCall();
1113 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001116#if 0
1117 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001118 that no encodings is given and then redirect to
1119 PyObject_Unicode() which then applies the additional logic for
1120 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001121
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001122 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001123 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124
1125 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001126 if (PyUnicode_Check(obj)) {
1127 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001128 PyErr_SetString(PyExc_TypeError,
1129 "decoding Unicode is not supported");
1130 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001131 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 return PyObject_Unicode(obj);
1133 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001134#else
1135 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001136 PyErr_SetString(PyExc_TypeError,
1137 "decoding Unicode is not supported");
1138 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001139 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001140#endif
1141
1142 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001143 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001144 s = PyString_AS_STRING(obj);
1145 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001146 }
Christian Heimes3497f942008-05-26 12:29:14 +00001147 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 /* Python 2.x specific */
1149 PyErr_Format(PyExc_TypeError,
1150 "decoding bytearray is not supported");
1151 return NULL;
1152 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001153 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001154 /* Overwrite the error message with something more useful in
1155 case of a TypeError. */
1156 if (PyErr_ExceptionMatches(PyExc_TypeError))
1157 PyErr_Format(PyExc_TypeError,
1158 "coercing to Unicode: need string or buffer, "
1159 "%.80s found",
1160 Py_TYPE(obj)->tp_name);
1161 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001162 }
Tim Petersced69f82003-09-16 20:30:58 +00001163
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001164 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001166 Py_INCREF(unicode_empty);
1167 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Tim Petersced69f82003-09-16 20:30:58 +00001169 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001170 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001171
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001172 return v;
1173
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001174 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176}
1177
1178PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001179 Py_ssize_t size,
1180 const char *encoding,
1181 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182{
1183 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001184
1185 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001186 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001187
1188 /* Shortcuts for common default encodings */
1189 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001191 else if (strcmp(encoding, "latin-1") == 0)
1192 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194 else if (strcmp(encoding, "mbcs") == 0)
1195 return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001197 else if (strcmp(encoding, "ascii") == 0)
1198 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199
1200 /* Decode via the codec registry */
1201 buffer = PyBuffer_FromMemory((void *)s, size);
1202 if (buffer == NULL)
1203 goto onError;
1204 unicode = PyCodec_Decode(buffer, encoding, errors);
1205 if (unicode == NULL)
1206 goto onError;
1207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001209 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001210 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 Py_DECREF(unicode);
1212 goto onError;
1213 }
1214 Py_DECREF(buffer);
1215 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001216
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001217 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 Py_XDECREF(buffer);
1219 return NULL;
1220}
1221
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001222PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223 const char *encoding,
1224 const char *errors)
1225{
1226 PyObject *v;
1227
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1231 }
1232
1233 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001234 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001235
1236 /* Decode via the codec registry */
1237 v = PyCodec_Decode(unicode, encoding, errors);
1238 if (v == NULL)
1239 goto onError;
1240 return v;
1241
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001242 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001243 return NULL;
1244}
1245
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001247 Py_ssize_t size,
1248 const char *encoding,
1249 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250{
1251 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001252
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 unicode = PyUnicode_FromUnicode(s, size);
1254 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257 Py_DECREF(unicode);
1258 return v;
1259}
1260
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001261PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262 const char *encoding,
1263 const char *errors)
1264{
1265 PyObject *v;
1266
1267 if (!PyUnicode_Check(unicode)) {
1268 PyErr_BadArgument();
1269 goto onError;
1270 }
1271
1272 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001273 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001274
1275 /* Encode via the codec registry */
1276 v = PyCodec_Encode(unicode, encoding, errors);
1277 if (v == NULL)
1278 goto onError;
1279 return v;
1280
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001281 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001282 return NULL;
1283}
1284
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286 const char *encoding,
1287 const char *errors)
1288{
1289 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 if (!PyUnicode_Check(unicode)) {
1292 PyErr_BadArgument();
1293 goto onError;
1294 }
Fred Drakee4315f52000-05-09 19:53:39 +00001295
Tim Petersced69f82003-09-16 20:30:58 +00001296 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001297 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001298
1299 /* Shortcuts for common default encodings */
1300 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001301 if (strcmp(encoding, "utf-8") == 0)
1302 return PyUnicode_AsUTF8String(unicode);
1303 else if (strcmp(encoding, "latin-1") == 0)
1304 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001305#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001306 else if (strcmp(encoding, "mbcs") == 0)
1307 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001308#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001309 else if (strcmp(encoding, "ascii") == 0)
1310 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001317 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001319 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001320 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 Py_DECREF(v);
1322 goto onError;
1323 }
1324 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001325
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001326 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 return NULL;
1328}
1329
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001330PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001331 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332{
1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334
1335 if (v)
1336 return v;
1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338 if (v && errors == NULL)
1339 ((PyUnicodeObject *)unicode)->defenc = v;
1340 return v;
1341}
1342
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344{
1345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 goto onError;
1348 }
1349 return PyUnicode_AS_UNICODE(unicode);
1350
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001351 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 return NULL;
1353}
1354
Martin v. Löwis18e16552006-02-15 17:27:45 +00001355Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001356{
1357 if (!PyUnicode_Check(unicode)) {
1358 PyErr_BadArgument();
1359 goto onError;
1360 }
1361 return PyUnicode_GET_SIZE(unicode);
1362
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001363 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364 return -1;
1365}
1366
Thomas Wouters78890102000-07-22 19:25:51 +00001367const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001368{
1369 return unicode_default_encoding;
1370}
1371
1372int PyUnicode_SetDefaultEncoding(const char *encoding)
1373{
1374 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001375
Fred Drakee4315f52000-05-09 19:53:39 +00001376 /* Make sure the encoding is valid. As side effect, this also
1377 loads the encoding into the codec registry cache. */
1378 v = _PyCodec_Lookup(encoding);
1379 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001380 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001381 Py_DECREF(v);
1382 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001383 encoding,
1384 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001385 return 0;
1386
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001387 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001388 return -1;
1389}
1390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001391/* error handling callback helper:
1392 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001393 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001394 and adjust various state variables.
1395 return 0 on success, -1 on error
1396*/
1397
1398static
1399int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001400 const char *encoding, const char *reason,
1401 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001404{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001405 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406
1407 PyObject *restuple = NULL;
1408 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410 Py_ssize_t requiredsize;
1411 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 int res = -1;
1415
1416 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001417 *errorHandler = PyCodec_LookupError(errors);
1418 if (*errorHandler == NULL)
1419 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 }
1421
1422 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001423 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001424 encoding, input, insize, *startinpos, *endinpos, reason);
1425 if (*exceptionObject == NULL)
1426 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001427 }
1428 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430 goto onError;
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 }
1436
1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001439 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001440 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001441 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001442 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 }
1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001448 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001451 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452
1453 /* need more space? (at least enough for what we
1454 have+the replacement+the rest of the string (starting
1455 at the new input position), so we won't have to check space
1456 when there are no errors in the rest of the string) */
1457 repptr = PyUnicode_AS_UNICODE(repunicode);
1458 repsize = PyUnicode_GET_SIZE(repunicode);
1459 requiredsize = *outpos + repsize + insize-newpos;
1460 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001461 if (requiredsize<2*outsize)
1462 requiredsize = 2*outsize;
1463 if (_PyUnicode_Resize(output, requiredsize) < 0)
1464 goto onError;
1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466 }
1467 *endinpos = newpos;
1468 *inptr = input + newpos;
1469 Py_UNICODE_COPY(*outptr, repptr, repsize);
1470 *outptr += repsize;
1471 *outpos += repsize;
1472 /* we made it! */
1473 res = 0;
1474
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001475 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001476 Py_XDECREF(restuple);
1477 return res;
1478}
1479
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480/* --- UTF-7 Codec -------------------------------------------------------- */
1481
Antoine Pitrou653dece2009-05-04 18:32:32 +00001482/* See RFC2152 for details. We encode conservatively and decode liberally. */
1483
/* Three simple macros defining base-64. */

/* Is c a base-64 character?  Spelled out as explicit ASCII ranges
   instead of isalnum(): the <ctype.h> classification depends on the
   current locale and is undefined for values that are neither EOF nor
   representable as unsigned char, whereas base-64 is defined purely in
   terms of ASCII. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
1503
/* DECODE_DIRECT: does this byte, met outside a base64 run in a UTF-7
 * string, decode as itself?  Decoding is permissive: every value up to
 * 127 is accepted literally except '+', which opens a base64
 * sequence. */

#define DECODE_DIRECT(c)                                \
    (!((c) > 127 || (c) == '+'))
1511
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must lie in 1..127 to be considered at all (NUL is never direct).
 * directO and directWS are parenthesized in the expansion so that a
 * compound argument such as `a || b` is evaluated as a unit instead of
 * being split by the && that follows it. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001559 Py_ssize_t size,
1560 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001562 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563}
1564
Antoine Pitrou653dece2009-05-04 18:32:32 +00001565/* The decoder. The only state we preserve is our read position,
1566 * i.e. how many characters we have consumed. So if we end in the
1567 * middle of a shift sequence we have to back off the read position
1568 * and the output to the beginning of the sequence, otherwise we lose
1569 * all the shift state (seen bits, number of bits seen, high
1570 * surrogate). */
1571
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001572PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001573 Py_ssize_t size,
1574 const char *errors,
1575 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001576{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001577 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t startinpos;
1579 Py_ssize_t endinpos;
1580 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581 const char *e;
1582 PyUnicodeObject *unicode;
1583 Py_UNICODE *p;
1584 const char *errmsg = "";
1585 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001586 Py_UNICODE *shiftOutStart;
1587 unsigned int base64bits = 0;
1588 unsigned long base64buffer = 0;
1589 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001590 PyObject *errorHandler = NULL;
1591 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592
1593 unicode = _PyUnicode_New(size);
1594 if (!unicode)
1595 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001596 if (size == 0) {
1597 if (consumed)
1598 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001599 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001600 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601
1602 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001603 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 e = s + size;
1605
1606 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001607 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 if (inShift) { /* in a base-64 section */
1610 if (IS_BASE64(ch)) { /* consume a base-64 character */
1611 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612 base64bits += 6;
1613 s++;
1614 if (base64bits >= 16) {
1615 /* we have enough bits for a UTF-16 value */
1616 Py_UNICODE outCh = (Py_UNICODE)
1617 (base64buffer >> (base64bits-16));
1618 base64bits -= 16;
1619 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620 if (surrogate) {
1621 /* expecting a second surrogate */
1622 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623#ifdef Py_UNICODE_WIDE
1624 *p++ = (((surrogate & 0x3FF)<<10)
1625 | (outCh & 0x3FF)) + 0x10000;
1626#else
1627 *p++ = surrogate;
1628 *p++ = outCh;
1629#endif
1630 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001631 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001632 }
1633 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001634 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001635 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001636 }
1637 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001638 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001639 /* first surrogate */
1640 surrogate = outCh;
1641 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001642 else {
1643 *p++ = outCh;
1644 }
1645 }
1646 }
1647 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648 inShift = 0;
1649 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001650 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001651 *p++ = surrogate;
1652 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001654 if (base64bits > 0) { /* left-over bits */
1655 if (base64bits >= 6) {
1656 /* We've seen at least one base-64 character */
1657 errmsg = "partial character in shift sequence";
1658 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 else {
1661 /* Some bits remain; they should be zero */
1662 if (base64buffer != 0) {
1663 errmsg = "non-zero padding bits in shift sequence";
1664 goto utf7Error;
1665 }
1666 }
1667 }
1668 if (ch != '-') {
1669 /* '-' is absorbed; other terminating
1670 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001671 *p++ = ch;
1672 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673 }
1674 }
1675 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001676 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001677 s++; /* consume '+' */
1678 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 s++;
1680 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001681 }
1682 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001683 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001684 shiftOutStart = p;
1685 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001686 }
1687 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001688 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 *p++ = ch;
1690 s++;
1691 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001692 else {
1693 startinpos = s-starts;
1694 s++;
1695 errmsg = "unexpected special character";
1696 goto utf7Error;
1697 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001698 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001699utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 outpos = p-PyUnicode_AS_UNICODE(unicode);
1701 endinpos = s-starts;
1702 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001703 errors, &errorHandler,
1704 "utf7", errmsg,
1705 starts, size, &startinpos, &endinpos, &exc, &s,
1706 &unicode, &outpos, &p))
1707 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 }
1709
Antoine Pitrou653dece2009-05-04 18:32:32 +00001710 /* end of string */
1711
1712 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1713 /* if we're in an inconsistent state, that's an error */
1714 if (surrogate ||
1715 (base64bits >= 6) ||
1716 (base64bits > 0 && base64buffer != 0)) {
1717 outpos = p-PyUnicode_AS_UNICODE(unicode);
1718 endinpos = size;
1719 if (unicode_decode_call_errorhandler(
1720 errors, &errorHandler,
1721 "utf7", "unterminated shift sequence",
1722 starts, size, &startinpos, &endinpos, &exc, &s,
1723 &unicode, &outpos, &p))
1724 goto onError;
1725 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001726 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001727
1728 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001729 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001730 if (inShift) {
1731 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001732 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733 }
1734 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001737 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001739 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 goto onError;
1741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 Py_XDECREF(errorHandler);
1743 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744 return (PyObject *)unicode;
1745
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001746 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001747 Py_XDECREF(errorHandler);
1748 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 Py_DECREF(unicode);
1750 return NULL;
1751}
1752
1753
1754PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001755 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001756 int base64SetO,
1757 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001758 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759{
1760 PyObject *v;
1761 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001762 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001763 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001764 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001765 unsigned int base64bits = 0;
1766 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 char * out;
1768 char * start;
1769
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001770 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001771 return PyErr_NoMemory();
1772
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001774 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001775
Antoine Pitrou653dece2009-05-04 18:32:32 +00001776 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001777 if (v == NULL)
1778 return NULL;
1779
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001780 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781 for (;i < size; ++i) {
1782 Py_UNICODE ch = s[i];
1783
Antoine Pitrou653dece2009-05-04 18:32:32 +00001784 if (inShift) {
1785 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1786 /* shifting out */
1787 if (base64bits) { /* output remaining bits */
1788 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1789 base64buffer = 0;
1790 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001791 }
1792 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001793 /* Characters not in the BASE64 set implicitly unshift the sequence
1794 so no '-' is required, except if the character is itself a '-' */
1795 if (IS_BASE64(ch) || ch == '-') {
1796 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001798 *out++ = (char) ch;
1799 }
1800 else {
1801 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001802 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 else { /* not in a shift sequence */
1805 if (ch == '+') {
1806 *out++ = '+';
1807 *out++ = '-';
1808 }
1809 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1810 *out++ = (char) ch;
1811 }
1812 else {
1813 *out++ = '+';
1814 inShift = 1;
1815 goto encode_char;
1816 }
1817 }
1818 continue;
1819encode_char:
1820#ifdef Py_UNICODE_WIDE
1821 if (ch >= 0x10000) {
1822 /* code first surrogate */
1823 base64bits += 16;
1824 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1825 while (base64bits >= 6) {
1826 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1827 base64bits -= 6;
1828 }
1829 /* prepare second surrogate */
1830 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1831 }
1832#endif
1833 base64bits += 16;
1834 base64buffer = (base64buffer << 16) | ch;
1835 while (base64bits >= 6) {
1836 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1837 base64bits -= 6;
1838 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001839 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001840 if (base64bits)
1841 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1842 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001843 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001845 if (_PyString_Resize(&v, out - start))
1846 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847 return v;
1848}
1849
Antoine Pitrou653dece2009-05-04 18:32:32 +00001850#undef IS_BASE64
1851#undef FROM_BASE64
1852#undef TO_BASE64
1853#undef DECODE_DIRECT
1854#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001855
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856/* --- UTF-8 Codec -------------------------------------------------------- */
1857
/* Read-only lookup table, hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1879
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001881 Py_ssize_t size,
1882 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883{
Walter Dörwald69652032004-09-07 20:24:22 +00001884 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1885}
1886
1887PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001888 Py_ssize_t size,
1889 const char *errors,
1890 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001891{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001892 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001893 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001894 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001895 Py_ssize_t startinpos;
1896 Py_ssize_t endinpos;
1897 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 const char *e;
1899 PyUnicodeObject *unicode;
1900 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001901 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 PyObject *errorHandler = NULL;
1903 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904
1905 /* Note: size will always be longer than the resulting Unicode
1906 character count */
1907 unicode = _PyUnicode_New(size);
1908 if (!unicode)
1909 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001910 if (size == 0) {
1911 if (consumed)
1912 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915
1916 /* Unpack UTF-8 encoded data */
1917 p = unicode->str;
1918 e = s + size;
1919
1920 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001921 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922
1923 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001924 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 s++;
1926 continue;
1927 }
1928
1929 n = utf8_code_length[ch];
1930
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001931 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001932 if (consumed)
1933 break;
1934 else {
1935 errmsg = "unexpected end of data";
1936 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001937 endinpos = startinpos+1;
1938 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1939 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001940 goto utf8Error;
1941 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943
1944 switch (n) {
1945
1946 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001947 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001948 startinpos = s-starts;
1949 endinpos = startinpos+1;
1950 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951
1952 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001953 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001960 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001961 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001962 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001963 goto utf8Error;
1964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001966 assert ((ch > 0x007F) && (ch <= 0x07FF));
1967 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 break;
1969
1970 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001971 /* XXX: surrogates shouldn't be valid UTF-8!
1972 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1973 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1974 Uncomment the 2 lines below to make them invalid,
1975 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001976 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00001977 (s[2] & 0xc0) != 0x80 ||
1978 ((unsigned char)s[0] == 0xE0 &&
1979 (unsigned char)s[1] < 0xA0)/* ||
1980 ((unsigned char)s[0] == 0xED &&
1981 (unsigned char)s[1] > 0x9F)*/) {
1982 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001983 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001984 endinpos = startinpos + 1;
1985
1986 /* if s[1] first two bits are 1 and 0, then the invalid
1987 continuation byte is s[2], so increment endinpos by 1,
1988 if not, s[1] is invalid and endinpos doesn't need to
1989 be incremented. */
1990 if ((s[1] & 0xC0) == 0x80)
1991 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001992 goto utf8Error;
1993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001995 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
1996 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001997 break;
1998
1999 case 4:
2000 if ((s[1] & 0xc0) != 0x80 ||
2001 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002002 (s[3] & 0xc0) != 0x80 ||
2003 ((unsigned char)s[0] == 0xF0 &&
2004 (unsigned char)s[1] < 0x90) ||
2005 ((unsigned char)s[0] == 0xF4 &&
2006 (unsigned char)s[1] > 0x8F)) {
2007 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002008 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002009 endinpos = startinpos + 1;
2010 if ((s[1] & 0xC0) == 0x80) {
2011 endinpos++;
2012 if ((s[2] & 0xC0) == 0x80)
2013 endinpos++;
2014 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002015 goto utf8Error;
2016 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002018 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2019 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2020
Fredrik Lundh8f455852001-06-27 18:59:43 +00002021#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002022 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002023#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002024 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002025
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002026 /* translate from 10000..10FFFF to 0..FFFF */
2027 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002028
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002029 /* high surrogate = top 10 bits added to D800 */
2030 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002033 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002034#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 }
2037 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002038 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002040 utf8Error:
2041 outpos = p-PyUnicode_AS_UNICODE(unicode);
2042 if (unicode_decode_call_errorhandler(
2043 errors, &errorHandler,
2044 "utf8", errmsg,
2045 starts, size, &startinpos, &endinpos, &exc, &s,
2046 &unicode, &outpos, &p))
2047 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald69652032004-09-07 20:24:22 +00002049 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002050 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
2052 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002053 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 goto onError;
2055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 Py_XDECREF(errorHandler);
2057 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 return (PyObject *)unicode;
2059
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002060 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 Py_XDECREF(errorHandler);
2062 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 Py_DECREF(unicode);
2064 return NULL;
2065}
2066
Tim Peters602f7402002-04-27 18:03:26 +00002067/* Allocation strategy: if the string is short, convert into a stack buffer
2068 and allocate exactly as much space needed at the end. Else allocate the
2069 maximum possible needed (4 result bytes per Unicode character), and return
2070 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002071*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002072PyObject *
2073PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002074 Py_ssize_t size,
2075 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076{
Tim Peters602f7402002-04-27 18:03:26 +00002077#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002078
Martin v. Löwis18e16552006-02-15 17:27:45 +00002079 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002080 PyObject *v; /* result string object */
2081 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002082 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002083 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002084 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002085
Tim Peters602f7402002-04-27 18:03:26 +00002086 assert(s != NULL);
2087 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088
Tim Peters602f7402002-04-27 18:03:26 +00002089 if (size <= MAX_SHORT_UNICHARS) {
2090 /* Write into the stack buffer; nallocated can't overflow.
2091 * At the end, we'll allocate exactly as much heap space as it
2092 * turns out we need.
2093 */
2094 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2095 v = NULL; /* will allocate after we're done */
2096 p = stackbuf;
2097 }
2098 else {
2099 /* Overallocate on the heap, and give the excess back at the end. */
2100 nallocated = size * 4;
2101 if (nallocated / 4 != size) /* overflow! */
2102 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002103 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002104 if (v == NULL)
2105 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002106 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002107 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002108
Tim Peters602f7402002-04-27 18:03:26 +00002109 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002110 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002111
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002112 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002113 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002117 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002118 *p++ = (char)(0xc0 | (ch >> 6));
2119 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002120 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121 else {
Tim Peters602f7402002-04-27 18:03:26 +00002122 /* Encode UCS2 Unicode ordinals */
2123 if (ch < 0x10000) {
2124 /* Special case: check for high surrogate */
2125 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2126 Py_UCS4 ch2 = s[i];
2127 /* Check for low surrogate and combine the two to
2128 form a UCS4 value */
2129 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002130 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002131 i++;
2132 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002133 }
Tim Peters602f7402002-04-27 18:03:26 +00002134 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002135 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002136 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002137 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2138 *p++ = (char)(0x80 | (ch & 0x3f));
2139 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002140 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002141 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002142 /* Encode UCS4 Unicode ordinals */
2143 *p++ = (char)(0xf0 | (ch >> 18));
2144 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2145 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2146 *p++ = (char)(0x80 | (ch & 0x3f));
2147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002149
Tim Peters602f7402002-04-27 18:03:26 +00002150 if (v == NULL) {
2151 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002152 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002153 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002154 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002155 }
2156 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002157 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002158 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002160 if (_PyString_Resize(&v, nneeded))
2161 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002164
Tim Peters602f7402002-04-27 18:03:26 +00002165#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166}
2167
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2169{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 if (!PyUnicode_Check(unicode)) {
2171 PyErr_BadArgument();
2172 return NULL;
2173 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002174 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002175 PyUnicode_GET_SIZE(unicode),
2176 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177}
2178
Walter Dörwald6e390802007-08-17 16:41:28 +00002179/* --- UTF-32 Codec ------------------------------------------------------- */
2180
2181PyObject *
2182PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002183 Py_ssize_t size,
2184 const char *errors,
2185 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002186{
2187 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2188}
2189
2190PyObject *
2191PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002192 Py_ssize_t size,
2193 const char *errors,
2194 int *byteorder,
2195 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002196{
2197 const char *starts = s;
2198 Py_ssize_t startinpos;
2199 Py_ssize_t endinpos;
2200 Py_ssize_t outpos;
2201 PyUnicodeObject *unicode;
2202 Py_UNICODE *p;
2203#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002204 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002205 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002206#else
2207 const int pairs = 0;
2208#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002209 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002210 int bo = 0; /* assume native ordering by default */
2211 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002212 /* Offsets from q for retrieving bytes in the right order. */
2213#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2214 int iorder[] = {0, 1, 2, 3};
2215#else
2216 int iorder[] = {3, 2, 1, 0};
2217#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002218 PyObject *errorHandler = NULL;
2219 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002220
Walter Dörwald6e390802007-08-17 16:41:28 +00002221 q = (unsigned char *)s;
2222 e = q + size;
2223
2224 if (byteorder)
2225 bo = *byteorder;
2226
2227 /* Check for BOM marks (U+FEFF) in the input and adjust current
2228 byte order setting accordingly. In native mode, the leading BOM
2229 mark is skipped, in all other modes, it is copied to the output
2230 stream as-is (giving a ZWNBSP character). */
2231 if (bo == 0) {
2232 if (size >= 4) {
2233 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002234 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002235#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002236 if (bom == 0x0000FEFF) {
2237 q += 4;
2238 bo = -1;
2239 }
2240 else if (bom == 0xFFFE0000) {
2241 q += 4;
2242 bo = 1;
2243 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002244#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002245 if (bom == 0x0000FEFF) {
2246 q += 4;
2247 bo = 1;
2248 }
2249 else if (bom == 0xFFFE0000) {
2250 q += 4;
2251 bo = -1;
2252 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002253#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002254 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002255 }
2256
2257 if (bo == -1) {
2258 /* force LE */
2259 iorder[0] = 0;
2260 iorder[1] = 1;
2261 iorder[2] = 2;
2262 iorder[3] = 3;
2263 }
2264 else if (bo == 1) {
2265 /* force BE */
2266 iorder[0] = 3;
2267 iorder[1] = 2;
2268 iorder[2] = 1;
2269 iorder[3] = 0;
2270 }
2271
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002272 /* On narrow builds we split characters outside the BMP into two
2273 codepoints => count how much extra space we need. */
2274#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002275 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002276 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2277 pairs++;
2278#endif
2279
2280 /* This might be one to much, because of a BOM */
2281 unicode = _PyUnicode_New((size+3)/4+pairs);
2282 if (!unicode)
2283 return NULL;
2284 if (size == 0)
2285 return (PyObject *)unicode;
2286
2287 /* Unpack UTF-32 encoded data */
2288 p = unicode->str;
2289
Walter Dörwald6e390802007-08-17 16:41:28 +00002290 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002291 Py_UCS4 ch;
2292 /* remaining bytes at the end? (size should be divisible by 4) */
2293 if (e-q<4) {
2294 if (consumed)
2295 break;
2296 errmsg = "truncated data";
2297 startinpos = ((const char *)q)-starts;
2298 endinpos = ((const char *)e)-starts;
2299 goto utf32Error;
2300 /* The remaining input chars are ignored if the callback
2301 chooses to skip the input */
2302 }
2303 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2304 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002305
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002306 if (ch >= 0x110000)
2307 {
2308 errmsg = "codepoint not in range(0x110000)";
2309 startinpos = ((const char *)q)-starts;
2310 endinpos = startinpos+4;
2311 goto utf32Error;
2312 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002313#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002314 if (ch >= 0x10000)
2315 {
2316 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2317 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2318 }
2319 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002320#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002321 *p++ = ch;
2322 q += 4;
2323 continue;
2324 utf32Error:
2325 outpos = p-PyUnicode_AS_UNICODE(unicode);
2326 if (unicode_decode_call_errorhandler(
2327 errors, &errorHandler,
2328 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002329 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002330 &unicode, &outpos, &p))
2331 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002332 }
2333
2334 if (byteorder)
2335 *byteorder = bo;
2336
2337 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002338 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002339
2340 /* Adjust length */
2341 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2342 goto onError;
2343
2344 Py_XDECREF(errorHandler);
2345 Py_XDECREF(exc);
2346 return (PyObject *)unicode;
2347
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002348 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002349 Py_DECREF(unicode);
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return NULL;
2353}
2354
2355PyObject *
2356PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002357 Py_ssize_t size,
2358 const char *errors,
2359 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002360{
2361 PyObject *v;
2362 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002363 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002364#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002365 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002366#else
2367 const int pairs = 0;
2368#endif
2369 /* Offsets from p for storing byte pairs in the right order. */
2370#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2371 int iorder[] = {0, 1, 2, 3};
2372#else
2373 int iorder[] = {3, 2, 1, 0};
2374#endif
2375
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002376#define STORECHAR(CH) \
2377 do { \
2378 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2379 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2380 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2381 p[iorder[0]] = (CH) & 0xff; \
2382 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002383 } while(0)
2384
2385 /* In narrow builds we can output surrogate pairs as one codepoint,
2386 so we need less space. */
2387#ifndef Py_UNICODE_WIDE
2388 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002389 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2390 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2391 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002392#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002393 nsize = (size - pairs + (byteorder == 0));
2394 bytesize = nsize * 4;
2395 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002396 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002397 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002398 if (v == NULL)
2399 return NULL;
2400
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002401 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002402 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002403 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002404 if (size == 0)
2405 return v;
2406
2407 if (byteorder == -1) {
2408 /* force LE */
2409 iorder[0] = 0;
2410 iorder[1] = 1;
2411 iorder[2] = 2;
2412 iorder[3] = 3;
2413 }
2414 else if (byteorder == 1) {
2415 /* force BE */
2416 iorder[0] = 3;
2417 iorder[1] = 2;
2418 iorder[2] = 1;
2419 iorder[3] = 0;
2420 }
2421
2422 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002423 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002424#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002425 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2426 Py_UCS4 ch2 = *s;
2427 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2428 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2429 s++;
2430 size--;
2431 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002432 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002433#endif
2434 STORECHAR(ch);
2435 }
2436 return v;
2437#undef STORECHAR
2438}
2439
2440PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2441{
2442 if (!PyUnicode_Check(unicode)) {
2443 PyErr_BadArgument();
2444 return NULL;
2445 }
2446 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002447 PyUnicode_GET_SIZE(unicode),
2448 NULL,
2449 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002450}
2451
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452/* --- UTF-16 Codec ------------------------------------------------------- */
2453
Tim Peters772747b2001-08-09 22:21:55 +00002454PyObject *
2455PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002456 Py_ssize_t size,
2457 const char *errors,
2458 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459{
Walter Dörwald69652032004-09-07 20:24:22 +00002460 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2461}
2462
2463PyObject *
2464PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002465 Py_ssize_t size,
2466 const char *errors,
2467 int *byteorder,
2468 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002469{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002470 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002471 Py_ssize_t startinpos;
2472 Py_ssize_t endinpos;
2473 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 PyUnicodeObject *unicode;
2475 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002476 const unsigned char *q, *e;
2477 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002478 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002479 /* Offsets from q for retrieving byte pairs in the right order. */
2480#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2481 int ihi = 1, ilo = 0;
2482#else
2483 int ihi = 0, ilo = 1;
2484#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 PyObject *errorHandler = NULL;
2486 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487
2488 /* Note: size will always be longer than the resulting Unicode
2489 character count */
2490 unicode = _PyUnicode_New(size);
2491 if (!unicode)
2492 return NULL;
2493 if (size == 0)
2494 return (PyObject *)unicode;
2495
2496 /* Unpack UTF-16 encoded data */
2497 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002498 q = (unsigned char *)s;
2499 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500
2501 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002502 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002504 /* Check for BOM marks (U+FEFF) in the input and adjust current
2505 byte order setting accordingly. In native mode, the leading BOM
2506 mark is skipped, in all other modes, it is copied to the output
2507 stream as-is (giving a ZWNBSP character). */
2508 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002509 if (size >= 2) {
2510 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002511#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002512 if (bom == 0xFEFF) {
2513 q += 2;
2514 bo = -1;
2515 }
2516 else if (bom == 0xFFFE) {
2517 q += 2;
2518 bo = 1;
2519 }
Tim Petersced69f82003-09-16 20:30:58 +00002520#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002521 if (bom == 0xFEFF) {
2522 q += 2;
2523 bo = 1;
2524 }
2525 else if (bom == 0xFFFE) {
2526 q += 2;
2527 bo = -1;
2528 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002529#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002530 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532
Tim Peters772747b2001-08-09 22:21:55 +00002533 if (bo == -1) {
2534 /* force LE */
2535 ihi = 1;
2536 ilo = 0;
2537 }
2538 else if (bo == 1) {
2539 /* force BE */
2540 ihi = 0;
2541 ilo = 1;
2542 }
2543
2544 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002545 Py_UNICODE ch;
2546 /* remaining bytes at the end? (size should be even) */
2547 if (e-q<2) {
2548 if (consumed)
2549 break;
2550 errmsg = "truncated data";
2551 startinpos = ((const char *)q)-starts;
2552 endinpos = ((const char *)e)-starts;
2553 goto utf16Error;
2554 /* The remaining input chars are ignored if the callback
2555 chooses to skip the input */
2556 }
2557 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558
Benjamin Peterson857ce152009-01-31 16:29:18 +00002559 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002560
2561 if (ch < 0xD800 || ch > 0xDFFF) {
2562 *p++ = ch;
2563 continue;
2564 }
2565
2566 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002567 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002568 q -= 2;
2569 if (consumed)
2570 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002571 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002572 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002573 endinpos = ((const char *)e)-starts;
2574 goto utf16Error;
2575 }
2576 if (0xD800 <= ch && ch <= 0xDBFF) {
2577 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2578 q += 2;
2579 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002580#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002581 *p++ = ch;
2582 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002583#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002584 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002585#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002586 continue;
2587 }
2588 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002589 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002590 startinpos = (((const char *)q)-4)-starts;
2591 endinpos = startinpos+2;
2592 goto utf16Error;
2593 }
2594
Benjamin Peterson857ce152009-01-31 16:29:18 +00002595 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002596 errmsg = "illegal encoding";
2597 startinpos = (((const char *)q)-2)-starts;
2598 endinpos = startinpos+2;
2599 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002600
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002601 utf16Error:
2602 outpos = p-PyUnicode_AS_UNICODE(unicode);
2603 if (unicode_decode_call_errorhandler(
2604 errors, &errorHandler,
2605 "utf16", errmsg,
2606 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2607 &unicode, &outpos, &p))
2608 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 }
2610
2611 if (byteorder)
2612 *byteorder = bo;
2613
Walter Dörwald69652032004-09-07 20:24:22 +00002614 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002615 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002616
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002618 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 goto onError;
2620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 Py_XDECREF(errorHandler);
2622 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 return (PyObject *)unicode;
2624
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002625 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 Py_XDECREF(errorHandler);
2628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 return NULL;
2630}
2631
Tim Peters772747b2001-08-09 22:21:55 +00002632PyObject *
2633PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002634 Py_ssize_t size,
2635 const char *errors,
2636 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637{
2638 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002639 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002640 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002641#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002642 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002643#else
2644 const int pairs = 0;
2645#endif
Tim Peters772747b2001-08-09 22:21:55 +00002646 /* Offsets from p for storing byte pairs in the right order. */
2647#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2648 int ihi = 1, ilo = 0;
2649#else
2650 int ihi = 0, ilo = 1;
2651#endif
2652
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002653#define STORECHAR(CH) \
2654 do { \
2655 p[ihi] = ((CH) >> 8) & 0xff; \
2656 p[ilo] = (CH) & 0xff; \
2657 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002658 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002660#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002661 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002662 if (s[i] >= 0x10000)
2663 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002664#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002665 /* 2 * (size + pairs + (byteorder == 0)) */
2666 if (size > PY_SSIZE_T_MAX ||
2667 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002668 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002669 nsize = size + pairs + (byteorder == 0);
2670 bytesize = nsize * 2;
2671 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002672 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002673 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 if (v == NULL)
2675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002677 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002679 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002680 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002681 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002682
2683 if (byteorder == -1) {
2684 /* force LE */
2685 ihi = 1;
2686 ilo = 0;
2687 }
2688 else if (byteorder == 1) {
2689 /* force BE */
2690 ihi = 0;
2691 ilo = 1;
2692 }
2693
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002694 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002695 Py_UNICODE ch = *s++;
2696 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002697#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002698 if (ch >= 0x10000) {
2699 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2700 ch = 0xD800 | ((ch-0x10000) >> 10);
2701 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002702#endif
Tim Peters772747b2001-08-09 22:21:55 +00002703 STORECHAR(ch);
2704 if (ch2)
2705 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002708#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709}
2710
2711PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2712{
2713 if (!PyUnicode_Check(unicode)) {
2714 PyErr_BadArgument();
2715 return NULL;
2716 }
2717 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002718 PyUnicode_GET_SIZE(unicode),
2719 NULL,
2720 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721}
2722
2723/* --- Unicode Escape Codec ----------------------------------------------- */
2724
Fredrik Lundh06d12682001-01-24 07:59:11 +00002725static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002726
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002728 Py_ssize_t size,
2729 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002732 Py_ssize_t startinpos;
2733 Py_ssize_t endinpos;
2734 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002739 char* message;
2740 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 PyObject *errorHandler = NULL;
2742 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002743
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 /* Escaped strings will always be longer than the resulting
2745 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 length after conversion to the true value.
2747 (but if the error callback returns a long replacement string
2748 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 v = _PyUnicode_New(size);
2750 if (v == NULL)
2751 goto onError;
2752 if (size == 0)
2753 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002755 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002757
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 while (s < end) {
2759 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002760 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002761 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762
2763 /* Non-escape characters are interpreted as Unicode ordinals */
2764 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002765 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 continue;
2767 }
2768
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 /* \ - Escapes */
2771 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002772 c = *s++;
2773 if (s > end)
2774 c = '\0'; /* Invalid after \ */
2775 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002777 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 case '\n': break;
2779 case '\\': *p++ = '\\'; break;
2780 case '\'': *p++ = '\''; break;
2781 case '\"': *p++ = '\"'; break;
2782 case 'b': *p++ = '\b'; break;
2783 case 'f': *p++ = '\014'; break; /* FF */
2784 case 't': *p++ = '\t'; break;
2785 case 'n': *p++ = '\n'; break;
2786 case 'r': *p++ = '\r'; break;
2787 case 'v': *p++ = '\013'; break; /* VT */
2788 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2789
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002790 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 case '0': case '1': case '2': case '3':
2792 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002793 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002794 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002795 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002796 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002797 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002799 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 break;
2801
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002802 /* hex escapes */
2803 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002805 digits = 2;
2806 message = "truncated \\xXX escape";
2807 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002809 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002811 digits = 4;
2812 message = "truncated \\uXXXX escape";
2813 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002815 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002816 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002817 digits = 8;
2818 message = "truncated \\UXXXXXXXX escape";
2819 hexescape:
2820 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 outpos = p-PyUnicode_AS_UNICODE(v);
2822 if (s+digits>end) {
2823 endinpos = size;
2824 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002825 errors, &errorHandler,
2826 "unicodeescape", "end of string in escape sequence",
2827 starts, size, &startinpos, &endinpos, &exc, &s,
2828 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 goto onError;
2830 goto nextByte;
2831 }
2832 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002833 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002834 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 endinpos = (s+i+1)-starts;
2836 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002837 errors, &errorHandler,
2838 "unicodeescape", message,
2839 starts, size, &startinpos, &endinpos, &exc, &s,
2840 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002841 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002843 }
2844 chr = (chr<<4) & ~0xF;
2845 if (c >= '0' && c <= '9')
2846 chr += c - '0';
2847 else if (c >= 'a' && c <= 'f')
2848 chr += 10 + c - 'a';
2849 else
2850 chr += 10 + c - 'A';
2851 }
2852 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002853 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854 /* _decoding_error will have already written into the
2855 target buffer. */
2856 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002857 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002858 /* when we get here, chr is a 32-bit unicode character */
2859 if (chr <= 0xffff)
2860 /* UCS-2 character */
2861 *p++ = (Py_UNICODE) chr;
2862 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002863 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002864 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002865#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002866 *p++ = chr;
2867#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002868 chr -= 0x10000L;
2869 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002870 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002871#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002872 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002873 endinpos = s-starts;
2874 outpos = p-PyUnicode_AS_UNICODE(v);
2875 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002876 errors, &errorHandler,
2877 "unicodeescape", "illegal Unicode character",
2878 starts, size, &startinpos, &endinpos, &exc, &s,
2879 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002880 goto onError;
2881 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002882 break;
2883
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002884 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002885 case 'N':
2886 message = "malformed \\N character escape";
2887 if (ucnhash_CAPI == NULL) {
2888 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002889 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002890 if (ucnhash_CAPI == NULL)
2891 goto ucnhashError;
2892 }
2893 if (*s == '{') {
2894 const char *start = s+1;
2895 /* look for the closing brace */
2896 while (*s != '}' && s < end)
2897 s++;
2898 if (s > start && s < end && *s == '}') {
2899 /* found a name. look it up in the unicode database */
2900 message = "unknown Unicode character name";
2901 s++;
Serhiy Storchaka1d3acd42013-01-21 11:48:24 +02002902 if (s - start - 1 <= INT_MAX &&
2903 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002904 goto store;
2905 }
2906 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002907 endinpos = s-starts;
2908 outpos = p-PyUnicode_AS_UNICODE(v);
2909 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002910 errors, &errorHandler,
2911 "unicodeescape", message,
2912 starts, size, &startinpos, &endinpos, &exc, &s,
2913 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002914 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002915 break;
2916
2917 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002918 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002919 message = "\\ at end of string";
2920 s--;
2921 endinpos = s-starts;
2922 outpos = p-PyUnicode_AS_UNICODE(v);
2923 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002924 errors, &errorHandler,
2925 "unicodeescape", message,
2926 starts, size, &startinpos, &endinpos, &exc, &s,
2927 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002928 goto onError;
2929 }
2930 else {
2931 *p++ = '\\';
2932 *p++ = (unsigned char)s[-1];
2933 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002934 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002935 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002936 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002938 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002939 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002940 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002941 Py_XDECREF(errorHandler);
2942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002944
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002945 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002946 PyErr_SetString(
2947 PyExc_UnicodeError,
2948 "\\N escapes not supported (can't load unicodedata module)"
2949 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002950 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002951 Py_XDECREF(errorHandler);
2952 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002953 return NULL;
2954
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002955 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002957 Py_XDECREF(errorHandler);
2958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959 return NULL;
2960}
2961
2962/* Return a Unicode-Escape string version of the Unicode object.
2963
2964 If quotes is true, the string is enclosed in u"" or u'' quotes as
2965 appropriate.
2966
2967*/
2968
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002969Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002970 Py_ssize_t size,
2971 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002972{
2973 /* like wcschr, but doesn't stop at NULL characters */
2974
2975 while (size-- > 0) {
2976 if (*s == ch)
2977 return s;
2978 s++;
2979 }
2980
2981 return NULL;
2982}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002983
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984static
2985PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002986 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 int quotes)
2988{
2989 PyObject *repr;
2990 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002992 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002993#ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize = 10;
2995#else
2996 const Py_ssize_t expandsize = 6;
2997#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998
Neal Norwitz17753ec2006-08-21 22:21:19 +00002999 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size.
3002 */
3003 /* Initial allocation is based on the longest-possible unichr
3004 escape.
3005
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape.
3011
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr
3014 escape.
3015 */
3016
Neal Norwitze7d8be82008-07-31 17:17:14 +00003017 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003018 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003019
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003020 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003021 2
3022 + expandsize*size
3023 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 if (repr == NULL)
3025 return NULL;
3026
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003027 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028
3029 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003031 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 !findchar(s, size, '"')) ? '"' : '\'';
3033 }
3034 while (size-- > 0) {
3035 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003036
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003037 /* Escape quotes and backslashes */
3038 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003039 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 *p++ = '\\';
3041 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003042 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003043 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003044
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003045#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003046 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch >= 0x10000) {
3048 *p++ = '\\';
3049 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003050 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003057 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003058 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003059 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003060#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch >= 0xD800 && ch < 0xDC00) {
3063 Py_UNICODE ch2;
3064 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003065
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003066 ch2 = *s++;
3067 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003068 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003069 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070 *p++ = '\\';
3071 *p++ = 'U';
3072 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079 *p++ = hexdigit[ucs & 0x0000000F];
3080 continue;
3081 }
3082 /* Fall through: isolated surrogates are copied as-is */
3083 s--;
3084 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003085 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003086#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003087
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003089 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 *p++ = '\\';
3091 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003092 *p++ = hexdigit[(ch >> 12) & 0x000F];
3093 *p++ = hexdigit[(ch >> 8) & 0x000F];
3094 *p++ = hexdigit[(ch >> 4) & 0x000F];
3095 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003097
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003098 /* Map special whitespace to '\t', \n', '\r' */
3099 else if (ch == '\t') {
3100 *p++ = '\\';
3101 *p++ = 't';
3102 }
3103 else if (ch == '\n') {
3104 *p++ = '\\';
3105 *p++ = 'n';
3106 }
3107 else if (ch == '\r') {
3108 *p++ = '\\';
3109 *p++ = 'r';
3110 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003111
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003112 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003113 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003115 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003116 *p++ = hexdigit[(ch >> 4) & 0x000F];
3117 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003118 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003119
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 /* Copy everything else as-is */
3121 else
3122 *p++ = (char) ch;
3123 }
3124 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003125 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003126
3127 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003128 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 return repr;
3131}
3132
3133PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003134 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003135{
3136 return unicodeescape_string(s, size, 0);
3137}
3138
3139PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3140{
3141 if (!PyUnicode_Check(unicode)) {
3142 PyErr_BadArgument();
3143 return NULL;
3144 }
3145 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003146 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147}
3148
3149/* --- Raw Unicode Escape Codec ------------------------------------------- */
3150
3151PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003152 Py_ssize_t size,
3153 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003155 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003156 Py_ssize_t startinpos;
3157 Py_ssize_t endinpos;
3158 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003160 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 const char *end;
3162 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003163 PyObject *errorHandler = NULL;
3164 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003165
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166 /* Escaped strings will always be longer than the resulting
3167 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003168 length after conversion to the true value. (But decoding error
3169 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 v = _PyUnicode_New(size);
3171 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003172 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003174 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003175 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 end = s + size;
3177 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003178 unsigned char c;
3179 Py_UCS4 x;
3180 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003181 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003183 /* Non-escape characters are interpreted as Unicode ordinals */
3184 if (*s != '\\') {
3185 *p++ = (unsigned char)*s++;
3186 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003187 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003188 startinpos = s-starts;
3189
3190 /* \u-escapes are only interpreted iff the number of leading
3191 backslashes if odd */
3192 bs = s;
3193 for (;s < end;) {
3194 if (*s != '\\')
3195 break;
3196 *p++ = (unsigned char)*s++;
3197 }
3198 if (((s - bs) & 1) == 0 ||
3199 s >= end ||
3200 (*s != 'u' && *s != 'U')) {
3201 continue;
3202 }
3203 p--;
3204 count = *s=='u' ? 4 : 8;
3205 s++;
3206
3207 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3208 outpos = p-PyUnicode_AS_UNICODE(v);
3209 for (x = 0, i = 0; i < count; ++i, ++s) {
3210 c = (unsigned char)*s;
3211 if (!isxdigit(c)) {
3212 endinpos = s-starts;
3213 if (unicode_decode_call_errorhandler(
3214 errors, &errorHandler,
3215 "rawunicodeescape", "truncated \\uXXXX",
3216 starts, size, &startinpos, &endinpos, &exc, &s,
3217 &v, &outpos, &p))
3218 goto onError;
3219 goto nextByte;
3220 }
3221 x = (x<<4) & ~0xF;
3222 if (c >= '0' && c <= '9')
3223 x += c - '0';
3224 else if (c >= 'a' && c <= 'f')
3225 x += 10 + c - 'a';
3226 else
3227 x += 10 + c - 'A';
3228 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003229 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003230 /* UCS-2 character */
3231 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003232 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003233 /* UCS-4 character. Either store directly, or as
3234 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003235#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003236 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003237#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003238 x -= 0x10000L;
3239 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3240 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003241#endif
3242 } else {
3243 endinpos = s-starts;
3244 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003245 if (unicode_decode_call_errorhandler(
3246 errors, &errorHandler,
3247 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003248 starts, size, &startinpos, &endinpos, &exc, &s,
3249 &v, &outpos, &p))
3250 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003251 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003252 nextByte:
3253 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003255 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003256 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 Py_XDECREF(errorHandler);
3258 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003260
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003261 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 return NULL;
3266}
3267
3268PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003269 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270{
3271 PyObject *repr;
3272 char *p;
3273 char *q;
3274
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003275 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003276#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003277 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003278#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003279 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003280#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003281
Neal Norwitze7d8be82008-07-31 17:17:14 +00003282 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003283 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003284
Neal Norwitze7d8be82008-07-31 17:17:14 +00003285 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003286 if (repr == NULL)
3287 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003288 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003289 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003290
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003291 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 while (size-- > 0) {
3293 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003294#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003295 /* Map 32-bit characters to '\Uxxxxxxxx' */
3296 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003297 *p++ = '\\';
3298 *p++ = 'U';
3299 *p++ = hexdigit[(ch >> 28) & 0xf];
3300 *p++ = hexdigit[(ch >> 24) & 0xf];
3301 *p++ = hexdigit[(ch >> 20) & 0xf];
3302 *p++ = hexdigit[(ch >> 16) & 0xf];
3303 *p++ = hexdigit[(ch >> 12) & 0xf];
3304 *p++ = hexdigit[(ch >> 8) & 0xf];
3305 *p++ = hexdigit[(ch >> 4) & 0xf];
3306 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003307 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003308 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003309#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003310 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3311 if (ch >= 0xD800 && ch < 0xDC00) {
3312 Py_UNICODE ch2;
3313 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003314
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003315 ch2 = *s++;
3316 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003317 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003318 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3319 *p++ = '\\';
3320 *p++ = 'U';
3321 *p++ = hexdigit[(ucs >> 28) & 0xf];
3322 *p++ = hexdigit[(ucs >> 24) & 0xf];
3323 *p++ = hexdigit[(ucs >> 20) & 0xf];
3324 *p++ = hexdigit[(ucs >> 16) & 0xf];
3325 *p++ = hexdigit[(ucs >> 12) & 0xf];
3326 *p++ = hexdigit[(ucs >> 8) & 0xf];
3327 *p++ = hexdigit[(ucs >> 4) & 0xf];
3328 *p++ = hexdigit[ucs & 0xf];
3329 continue;
3330 }
3331 /* Fall through: isolated surrogates are copied as-is */
3332 s--;
3333 size++;
3334 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003335#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003336 /* Map 16-bit characters to '\uxxxx' */
3337 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 *p++ = '\\';
3339 *p++ = 'u';
3340 *p++ = hexdigit[(ch >> 12) & 0xf];
3341 *p++ = hexdigit[(ch >> 8) & 0xf];
3342 *p++ = hexdigit[(ch >> 4) & 0xf];
3343 *p++ = hexdigit[ch & 15];
3344 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003345 /* Copy everything else as-is */
3346 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 *p++ = (char) ch;
3348 }
3349 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003350 if (_PyString_Resize(&repr, p - q))
3351 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352 return repr;
3353}
3354
3355PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3356{
3357 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003358 PyErr_BadArgument();
3359 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003360 }
3361 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003362 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363}
3364
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003365/* --- Unicode Internal Codec ------------------------------------------- */
3366
3367PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003368 Py_ssize_t size,
3369 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003370{
3371 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003372 Py_ssize_t startinpos;
3373 Py_ssize_t endinpos;
3374 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003375 PyUnicodeObject *v;
3376 Py_UNICODE *p;
3377 const char *end;
3378 const char *reason;
3379 PyObject *errorHandler = NULL;
3380 PyObject *exc = NULL;
3381
Neal Norwitzd43069c2006-01-08 01:12:10 +00003382#ifdef Py_UNICODE_WIDE
3383 Py_UNICODE unimax = PyUnicode_GetMax();
3384#endif
3385
Armin Rigo7ccbca92006-10-04 12:17:45 +00003386 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003387 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3388 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003389 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003390 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003391 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003392 p = PyUnicode_AS_UNICODE(v);
3393 end = s + size;
3394
3395 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003396 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003397 /* We have to sanity check the raw data, otherwise doom looms for
3398 some malformed UCS-4 data. */
3399 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003400#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003401 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003402#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003403 end-s < Py_UNICODE_SIZE
3404 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003405 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003406 startinpos = s - starts;
3407 if (end-s < Py_UNICODE_SIZE) {
3408 endinpos = end-starts;
3409 reason = "truncated input";
3410 }
3411 else {
3412 endinpos = s - starts + Py_UNICODE_SIZE;
3413 reason = "illegal code point (> 0x10FFFF)";
3414 }
3415 outpos = p - PyUnicode_AS_UNICODE(v);
3416 if (unicode_decode_call_errorhandler(
3417 errors, &errorHandler,
3418 "unicode_internal", reason,
3419 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003420 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003421 goto onError;
3422 }
3423 }
3424 else {
3425 p++;
3426 s += Py_UNICODE_SIZE;
3427 }
3428 }
3429
Martin v. Löwis412fb672006-04-13 06:34:32 +00003430 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003431 goto onError;
3432 Py_XDECREF(errorHandler);
3433 Py_XDECREF(exc);
3434 return (PyObject *)v;
3435
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003436 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003437 Py_XDECREF(v);
3438 Py_XDECREF(errorHandler);
3439 Py_XDECREF(exc);
3440 return NULL;
3441}
3442
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443/* --- Latin-1 Codec ------------------------------------------------------ */
3444
3445PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003446 Py_ssize_t size,
3447 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448{
3449 PyUnicodeObject *v;
3450 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003451
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003453 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003454 Py_UNICODE r = *(unsigned char*)s;
3455 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003456 }
3457
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458 v = _PyUnicode_New(size);
3459 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003460 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003462 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003463 p = PyUnicode_AS_UNICODE(v);
3464 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003465 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003467
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003468 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 Py_XDECREF(v);
3470 return NULL;
3471}
3472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473/* create or adjust a UnicodeEncodeError */
3474static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003475 const char *encoding,
3476 const Py_UNICODE *unicode, Py_ssize_t size,
3477 Py_ssize_t startpos, Py_ssize_t endpos,
3478 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003479{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003480 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003481 *exceptionObject = PyUnicodeEncodeError_Create(
3482 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483 }
3484 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003485 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3486 goto onError;
3487 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3488 goto onError;
3489 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3490 goto onError;
3491 return;
3492 onError:
3493 Py_DECREF(*exceptionObject);
3494 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 }
3496}
3497
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498/* raises a UnicodeEncodeError */
3499static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003500 const char *encoding,
3501 const Py_UNICODE *unicode, Py_ssize_t size,
3502 Py_ssize_t startpos, Py_ssize_t endpos,
3503 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003504{
3505 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003506 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003508 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003509}
3510
3511/* error handling callback helper:
3512 build arguments, call the callback and check the arguments,
3513 put the result into newpos and return the replacement string, which
3514 has to be freed by the caller */
3515static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003516 PyObject **errorHandler,
3517 const char *encoding, const char *reason,
3518 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3519 Py_ssize_t startpos, Py_ssize_t endpos,
3520 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003521{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003522 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523
3524 PyObject *restuple;
3525 PyObject *resunicode;
3526
3527 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003528 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 }
3532
3533 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003534 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003535 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003537
3538 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003539 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003541 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003543 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003544 Py_DECREF(restuple);
3545 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 }
3547 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003548 &resunicode, newpos)) {
3549 Py_DECREF(restuple);
3550 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003551 }
3552 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003553 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003554 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003555 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3556 Py_DECREF(restuple);
3557 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003558 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003559 Py_INCREF(resunicode);
3560 Py_DECREF(restuple);
3561 return resunicode;
3562}
3563
3564static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003565 Py_ssize_t size,
3566 const char *errors,
3567 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568{
3569 /* output object */
3570 PyObject *res;
3571 /* pointers to the beginning and end+1 of input */
3572 const Py_UNICODE *startp = p;
3573 const Py_UNICODE *endp = p + size;
3574 /* pointer to the beginning of the unencodable characters */
3575 /* const Py_UNICODE *badp = NULL; */
3576 /* pointer into the output */
3577 char *str;
3578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003579 Py_ssize_t respos = 0;
3580 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003581 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3582 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003583 PyObject *errorHandler = NULL;
3584 PyObject *exc = NULL;
3585 /* the following variable is used for caching string comparisons
3586 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3587 int known_errorHandler = -1;
3588
3589 /* allocate enough for a simple encoding without
3590 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003591 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 if (res == NULL)
3593 goto onError;
3594 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003595 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003596 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597 ressize = size;
3598
3599 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003600 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003602 /* can we encode this? */
3603 if (c<limit) {
3604 /* no overflow check, because we know that the space is enough */
3605 *str++ = (char)c;
3606 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003607 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003608 else {
3609 Py_ssize_t unicodepos = p-startp;
3610 Py_ssize_t requiredsize;
3611 PyObject *repunicode;
3612 Py_ssize_t repsize;
3613 Py_ssize_t newpos;
3614 Py_ssize_t respos;
3615 Py_UNICODE *uni2;
3616 /* startpos for collecting unencodable chars */
3617 const Py_UNICODE *collstart = p;
3618 const Py_UNICODE *collend = p;
3619 /* find all unecodable characters */
3620 while ((collend < endp) && ((*collend)>=limit))
3621 ++collend;
3622 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3623 if (known_errorHandler==-1) {
3624 if ((errors==NULL) || (!strcmp(errors, "strict")))
3625 known_errorHandler = 1;
3626 else if (!strcmp(errors, "replace"))
3627 known_errorHandler = 2;
3628 else if (!strcmp(errors, "ignore"))
3629 known_errorHandler = 3;
3630 else if (!strcmp(errors, "xmlcharrefreplace"))
3631 known_errorHandler = 4;
3632 else
3633 known_errorHandler = 0;
3634 }
3635 switch (known_errorHandler) {
3636 case 1: /* strict */
3637 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3638 goto onError;
3639 case 2: /* replace */
3640 while (collstart++<collend)
3641 *str++ = '?'; /* fall through */
3642 case 3: /* ignore */
3643 p = collend;
3644 break;
3645 case 4: /* xmlcharrefreplace */
3646 respos = str-PyString_AS_STRING(res);
3647 /* determine replacement size (temporarily (mis)uses p) */
3648 for (p = collstart, repsize = 0; p < collend; ++p) {
3649 if (*p<10)
3650 repsize += 2+1+1;
3651 else if (*p<100)
3652 repsize += 2+2+1;
3653 else if (*p<1000)
3654 repsize += 2+3+1;
3655 else if (*p<10000)
3656 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003657#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003658 else
3659 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003660#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003661 else if (*p<100000)
3662 repsize += 2+5+1;
3663 else if (*p<1000000)
3664 repsize += 2+6+1;
3665 else
3666 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003667#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003668 }
3669 requiredsize = respos+repsize+(endp-collend);
3670 if (requiredsize > ressize) {
3671 if (requiredsize<2*ressize)
3672 requiredsize = 2*ressize;
3673 if (_PyString_Resize(&res, requiredsize))
3674 goto onError;
3675 str = PyString_AS_STRING(res) + respos;
3676 ressize = requiredsize;
3677 }
3678 /* generate replacement (temporarily (mis)uses p) */
3679 for (p = collstart; p < collend; ++p) {
3680 str += sprintf(str, "&#%d;", (int)*p);
3681 }
3682 p = collend;
3683 break;
3684 default:
3685 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3686 encoding, reason, startp, size, &exc,
3687 collstart-startp, collend-startp, &newpos);
3688 if (repunicode == NULL)
3689 goto onError;
3690 /* need more space? (at least enough for what we have+the
3691 replacement+the rest of the string, so we won't have to
3692 check space for encodable characters) */
3693 respos = str-PyString_AS_STRING(res);
3694 repsize = PyUnicode_GET_SIZE(repunicode);
3695 requiredsize = respos+repsize+(endp-collend);
3696 if (requiredsize > ressize) {
3697 if (requiredsize<2*ressize)
3698 requiredsize = 2*ressize;
3699 if (_PyString_Resize(&res, requiredsize)) {
3700 Py_DECREF(repunicode);
3701 goto onError;
3702 }
3703 str = PyString_AS_STRING(res) + respos;
3704 ressize = requiredsize;
3705 }
3706 /* check if there is anything unencodable in the replacement
3707 and copy it to the output */
3708 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3709 c = *uni2;
3710 if (c >= limit) {
3711 raise_encode_exception(&exc, encoding, startp, size,
3712 unicodepos, unicodepos+1, reason);
3713 Py_DECREF(repunicode);
3714 goto onError;
3715 }
3716 *str = (char)c;
3717 }
3718 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003719 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003720 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003721 }
3722 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003723 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003724 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003726 /* If this falls res will be NULL */
3727 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003728 Py_XDECREF(errorHandler);
3729 Py_XDECREF(exc);
3730 return res;
3731
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003732 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003733 Py_XDECREF(res);
3734 Py_XDECREF(errorHandler);
3735 Py_XDECREF(exc);
3736 return NULL;
3737}
3738
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003740 Py_ssize_t size,
3741 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744}
3745
3746PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3747{
3748 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003749 PyErr_BadArgument();
3750 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 }
3752 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003753 PyUnicode_GET_SIZE(unicode),
3754 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755}
3756
3757/* --- 7-bit ASCII Codec -------------------------------------------------- */
3758
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003760 Py_ssize_t size,
3761 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 PyUnicodeObject *v;
3765 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003766 Py_ssize_t startinpos;
3767 Py_ssize_t endinpos;
3768 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003769 const char *e;
3770 PyObject *errorHandler = NULL;
3771 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003772
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003774 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003775 Py_UNICODE r = *(unsigned char*)s;
3776 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003777 }
Tim Petersced69f82003-09-16 20:30:58 +00003778
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 v = _PyUnicode_New(size);
3780 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003781 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003783 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003785 e = s + size;
3786 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003787 register unsigned char c = (unsigned char)*s;
3788 if (c < 128) {
3789 *p++ = c;
3790 ++s;
3791 }
3792 else {
3793 startinpos = s-starts;
3794 endinpos = startinpos + 1;
3795 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3796 if (unicode_decode_call_errorhandler(
3797 errors, &errorHandler,
3798 "ascii", "ordinal not in range(128)",
3799 starts, size, &startinpos, &endinpos, &exc, &s,
3800 &v, &outpos, &p))
3801 goto onError;
3802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003804 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003805 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3806 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807 Py_XDECREF(errorHandler);
3808 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003810
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003811 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003813 Py_XDECREF(errorHandler);
3814 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 return NULL;
3816}
3817
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003819 Py_ssize_t size,
3820 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823}
3824
3825PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3826{
3827 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003828 PyErr_BadArgument();
3829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 }
3831 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003832 PyUnicode_GET_SIZE(unicode),
3833 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834}
3835
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003836#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003837
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003838/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003839
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003840#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003841#define NEED_RETRY
3842#endif
3843
3844/* XXX This code is limited to "true" double-byte encodings, as
3845 a) it assumes an incomplete character consists of a single byte, and
3846 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003847 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003848
/* Return non-zero when the byte at s[offset] acts as a DBCS lead byte.

   A byte that IsDBCSLeadByte() accepts only counts if it is not itself
   the trail byte of the preceding character; CharPrev() is used to find
   where that preceding character starts. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;               /* CharPrev() did not move back */
    if (!IsDBCSLeadByte(*before))
        return 1;               /* previous character is single-byte */
    return (target - before == 2);
}
3859
3860/*
3861 * Decode MBCS string into unicode object. If 'final' is set, converts
3862 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3863 */
3864static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003865 const char *s, /* MBCS string */
3866 int size, /* sizeof MBCS string */
3867 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003868{
3869 Py_UNICODE *p;
3870 Py_ssize_t n = 0;
3871 int usize = 0;
3872
3873 assert(size >= 0);
3874
3875 /* Skip trailing lead-byte unless 'final' is set */
3876 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003877 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003878
3879 /* First get the size of the result */
3880 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003881 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3882 if (usize == 0) {
3883 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3884 return -1;
3885 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003886 }
3887
3888 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003889 /* Create unicode object */
3890 *v = _PyUnicode_New(usize);
3891 if (*v == NULL)
3892 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003893 }
3894 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003895 /* Extend unicode object */
3896 n = PyUnicode_GET_SIZE(*v);
3897 if (_PyUnicode_Resize(v, n + usize) < 0)
3898 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003899 }
3900
3901 /* Do the conversion */
3902 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003903 p = PyUnicode_AS_UNICODE(*v) + n;
3904 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3905 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3906 return -1;
3907 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003908 }
3909
3910 return size;
3911}
3912
3913PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003914 Py_ssize_t size,
3915 const char *errors,
3916 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003917{
3918 PyUnicodeObject *v = NULL;
3919 int done;
3920
3921 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003922 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003923
3924#ifdef NEED_RETRY
3925 retry:
3926 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003927 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003928 else
3929#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003930 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003931
3932 if (done < 0) {
3933 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003934 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003935 }
3936
3937 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003938 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003939
3940#ifdef NEED_RETRY
3941 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003942 s += done;
3943 size -= done;
3944 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003945 }
3946#endif
3947
3948 return (PyObject *)v;
3949}
3950
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003951PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003952 Py_ssize_t size,
3953 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003954{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003955 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3956}
3957
3958/*
3959 * Convert unicode into string object (MBCS).
3960 * Returns 0 if succeed, -1 otherwise.
3961 */
3962static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003963 const Py_UNICODE *p, /* unicode */
3964 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003965{
3966 int mbcssize = 0;
3967 Py_ssize_t n = 0;
3968
3969 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003970
3971 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003972 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003973 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3974 if (mbcssize == 0) {
3975 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3976 return -1;
3977 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003978 }
3979
Martin v. Löwisd8251432006-06-14 05:21:04 +00003980 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003981 /* Create string object */
3982 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3983 if (*repr == NULL)
3984 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003985 }
3986 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003987 /* Extend string object */
3988 n = PyString_Size(*repr);
3989 if (_PyString_Resize(repr, n + mbcssize) < 0)
3990 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003991 }
3992
3993 /* Do the conversion */
3994 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003995 char *s = PyString_AS_STRING(*repr) + n;
3996 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3997 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3998 return -1;
3999 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004000 }
4001
4002 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004003}
4004
4005PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004006 Py_ssize_t size,
4007 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004008{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004009 PyObject *repr = NULL;
4010 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004011
Martin v. Löwisd8251432006-06-14 05:21:04 +00004012#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004013 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004014 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004015 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004016 else
4017#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004018 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004019
Martin v. Löwisd8251432006-06-14 05:21:04 +00004020 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004021 Py_XDECREF(repr);
4022 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004023 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004024
4025#ifdef NEED_RETRY
4026 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004027 p += INT_MAX;
4028 size -= INT_MAX;
4029 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004030 }
4031#endif
4032
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004033 return repr;
4034}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004035
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004036PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4037{
4038 if (!PyUnicode_Check(unicode)) {
4039 PyErr_BadArgument();
4040 return NULL;
4041 }
4042 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004043 PyUnicode_GET_SIZE(unicode),
4044 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004045}
4046
Martin v. Löwisd8251432006-06-14 05:21:04 +00004047#undef NEED_RETRY
4048
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004049#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004050
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051/* --- Character Mapping Codec -------------------------------------------- */
4052
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004054 Py_ssize_t size,
4055 PyObject *mapping,
4056 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004059 Py_ssize_t startinpos;
4060 Py_ssize_t endinpos;
4061 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004062 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 PyUnicodeObject *v;
4064 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004065 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004066 PyObject *errorHandler = NULL;
4067 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004068 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004069 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004070
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 /* Default to Latin-1 */
4072 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004073 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074
4075 v = _PyUnicode_New(size);
4076 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004077 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004079 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004082 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004083 mapstring = PyUnicode_AS_UNICODE(mapping);
4084 maplen = PyUnicode_GET_SIZE(mapping);
4085 while (s < e) {
4086 unsigned char ch = *s;
4087 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004089 if (ch < maplen)
4090 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004092 if (x == 0xfffe) {
4093 /* undefined mapping */
4094 outpos = p-PyUnicode_AS_UNICODE(v);
4095 startinpos = s-starts;
4096 endinpos = startinpos+1;
4097 if (unicode_decode_call_errorhandler(
4098 errors, &errorHandler,
4099 "charmap", "character maps to <undefined>",
4100 starts, size, &startinpos, &endinpos, &exc, &s,
4101 &v, &outpos, &p)) {
4102 goto onError;
4103 }
4104 continue;
4105 }
4106 *p++ = x;
4107 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004108 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004109 }
4110 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004111 while (s < e) {
4112 unsigned char ch = *s;
4113 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004114
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004115 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4116 w = PyInt_FromLong((long)ch);
4117 if (w == NULL)
4118 goto onError;
4119 x = PyObject_GetItem(mapping, w);
4120 Py_DECREF(w);
4121 if (x == NULL) {
4122 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4123 /* No mapping found means: mapping is undefined. */
4124 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004125 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004126 } else
4127 goto onError;
4128 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004129
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004130 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004131 if (x == Py_None)
4132 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004133 if (PyInt_Check(x)) {
4134 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004135 if (value == 0xFFFE)
4136 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004137 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004138 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004139 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004140 Py_DECREF(x);
4141 goto onError;
4142 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004143
4144#ifndef Py_UNICODE_WIDE
4145 if (value > 0xFFFF) {
4146 /* see the code for 1-n mapping below */
4147 if (extrachars < 2) {
4148 /* resize first */
4149 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4150 Py_ssize_t needed = 10 - extrachars;
4151 extrachars += needed;
4152 /* XXX overflow detection missing */
4153 if (_PyUnicode_Resize(&v,
4154 PyUnicode_GET_SIZE(v) + needed) < 0) {
4155 Py_DECREF(x);
4156 goto onError;
4157 }
4158 p = PyUnicode_AS_UNICODE(v) + oldpos;
4159 }
4160 value -= 0x10000;
4161 *p++ = 0xD800 | (value >> 10);
4162 *p++ = 0xDC00 | (value & 0x3FF);
4163 extrachars -= 2;
4164 }
4165 else
4166#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004167 *p++ = (Py_UNICODE)value;
4168 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004169 else if (PyUnicode_Check(x)) {
4170 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004171
Serhiy Storchaka95997452013-01-15 14:42:59 +02004172 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004173 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004174 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4175 if (value == 0xFFFE)
4176 goto Undefined;
4177 *p++ = value;
4178 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004179 else if (targetsize > 1) {
4180 /* 1-n mapping */
4181 if (targetsize > extrachars) {
4182 /* resize first */
4183 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4184 Py_ssize_t needed = (targetsize - extrachars) + \
4185 (targetsize << 2);
4186 extrachars += needed;
4187 /* XXX overflow detection missing */
4188 if (_PyUnicode_Resize(&v,
4189 PyUnicode_GET_SIZE(v) + needed) < 0) {
4190 Py_DECREF(x);
4191 goto onError;
4192 }
4193 p = PyUnicode_AS_UNICODE(v) + oldpos;
4194 }
4195 Py_UNICODE_COPY(p,
4196 PyUnicode_AS_UNICODE(x),
4197 targetsize);
4198 p += targetsize;
4199 extrachars -= targetsize;
4200 }
4201 /* 1-0 mapping: skip the character */
4202 }
4203 else {
4204 /* wrong return value */
4205 PyErr_SetString(PyExc_TypeError,
4206 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004207 Py_DECREF(x);
4208 goto onError;
4209 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004210 Py_DECREF(x);
4211 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004212 continue;
4213Undefined:
4214 /* undefined mapping */
4215 Py_XDECREF(x);
4216 outpos = p-PyUnicode_AS_UNICODE(v);
4217 startinpos = s-starts;
4218 endinpos = startinpos+1;
4219 if (unicode_decode_call_errorhandler(
4220 errors, &errorHandler,
4221 "charmap", "character maps to <undefined>",
4222 starts, size, &startinpos, &endinpos, &exc, &s,
4223 &v, &outpos, &p)) {
4224 goto onError;
4225 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 }
4228 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004229 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4230 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 Py_XDECREF(errorHandler);
4232 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004234
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004235 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 Py_XDECREF(errorHandler);
4237 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238 Py_XDECREF(v);
4239 return NULL;
4240}
4241
Martin v. Löwis3f767792006-06-04 19:36:28 +00004242/* Charmap encoding: the lookup table */
4243
4244struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004245 PyObject_HEAD
4246 unsigned char level1[32];
4247 int count2, count3;
4248 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004249};
4250
4251static PyObject*
4252encoding_map_size(PyObject *obj, PyObject* args)
4253{
4254 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004255 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004256 128*map->count3);
4257}
4258
4259static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004260 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004261 PyDoc_STR("Return the size (in bytes) of this object") },
4262 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004263};
4264
4265static void
4266encoding_map_dealloc(PyObject* o)
4267{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004268 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004269}
4270
4271static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004272 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004273 "EncodingMap", /*tp_name*/
4274 sizeof(struct encoding_map), /*tp_basicsize*/
4275 0, /*tp_itemsize*/
4276 /* methods */
4277 encoding_map_dealloc, /*tp_dealloc*/
4278 0, /*tp_print*/
4279 0, /*tp_getattr*/
4280 0, /*tp_setattr*/
4281 0, /*tp_compare*/
4282 0, /*tp_repr*/
4283 0, /*tp_as_number*/
4284 0, /*tp_as_sequence*/
4285 0, /*tp_as_mapping*/
4286 0, /*tp_hash*/
4287 0, /*tp_call*/
4288 0, /*tp_str*/
4289 0, /*tp_getattro*/
4290 0, /*tp_setattro*/
4291 0, /*tp_as_buffer*/
4292 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4293 0, /*tp_doc*/
4294 0, /*tp_traverse*/
4295 0, /*tp_clear*/
4296 0, /*tp_richcompare*/
4297 0, /*tp_weaklistoffset*/
4298 0, /*tp_iter*/
4299 0, /*tp_iternext*/
4300 encoding_map_methods, /*tp_methods*/
4301 0, /*tp_members*/
4302 0, /*tp_getset*/
4303 0, /*tp_base*/
4304 0, /*tp_dict*/
4305 0, /*tp_descr_get*/
4306 0, /*tp_descr_set*/
4307 0, /*tp_dictoffset*/
4308 0, /*tp_init*/
4309 0, /*tp_alloc*/
4310 0, /*tp_new*/
4311 0, /*tp_free*/
4312 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004313};
4314
4315PyObject*
4316PyUnicode_BuildEncodingMap(PyObject* string)
4317{
4318 Py_UNICODE *decode;
4319 PyObject *result;
4320 struct encoding_map *mresult;
4321 int i;
4322 int need_dict = 0;
4323 unsigned char level1[32];
4324 unsigned char level2[512];
4325 unsigned char *mlevel1, *mlevel2, *mlevel3;
4326 int count2 = 0, count3 = 0;
4327
4328 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4329 PyErr_BadArgument();
4330 return NULL;
4331 }
4332 decode = PyUnicode_AS_UNICODE(string);
4333 memset(level1, 0xFF, sizeof level1);
4334 memset(level2, 0xFF, sizeof level2);
4335
4336 /* If there isn't a one-to-one mapping of NULL to \0,
4337 or if there are non-BMP characters, we need to use
4338 a mapping dictionary. */
4339 if (decode[0] != 0)
4340 need_dict = 1;
4341 for (i = 1; i < 256; i++) {
4342 int l1, l2;
4343 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004344#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004345 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004346#endif
4347 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004348 need_dict = 1;
4349 break;
4350 }
4351 if (decode[i] == 0xFFFE)
4352 /* unmapped character */
4353 continue;
4354 l1 = decode[i] >> 11;
4355 l2 = decode[i] >> 7;
4356 if (level1[l1] == 0xFF)
4357 level1[l1] = count2++;
4358 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004359 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004360 }
4361
4362 if (count2 >= 0xFF || count3 >= 0xFF)
4363 need_dict = 1;
4364
4365 if (need_dict) {
4366 PyObject *result = PyDict_New();
4367 PyObject *key, *value;
4368 if (!result)
4369 return NULL;
4370 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004371 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004372 key = PyInt_FromLong(decode[i]);
4373 value = PyInt_FromLong(i);
4374 if (!key || !value)
4375 goto failed1;
4376 if (PyDict_SetItem(result, key, value) == -1)
4377 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004378 Py_DECREF(key);
4379 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004380 }
4381 return result;
4382 failed1:
4383 Py_XDECREF(key);
4384 Py_XDECREF(value);
4385 Py_DECREF(result);
4386 return NULL;
4387 }
4388
4389 /* Create a three-level trie */
4390 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4391 16*count2 + 128*count3 - 1);
4392 if (!result)
4393 return PyErr_NoMemory();
4394 PyObject_Init(result, &EncodingMapType);
4395 mresult = (struct encoding_map*)result;
4396 mresult->count2 = count2;
4397 mresult->count3 = count3;
4398 mlevel1 = mresult->level1;
4399 mlevel2 = mresult->level23;
4400 mlevel3 = mresult->level23 + 16*count2;
4401 memcpy(mlevel1, level1, 32);
4402 memset(mlevel2, 0xFF, 16*count2);
4403 memset(mlevel3, 0, 128*count3);
4404 count3 = 0;
4405 for (i = 1; i < 256; i++) {
4406 int o1, o2, o3, i2, i3;
4407 if (decode[i] == 0xFFFE)
4408 /* unmapped character */
4409 continue;
4410 o1 = decode[i]>>11;
4411 o2 = (decode[i]>>7) & 0xF;
4412 i2 = 16*mlevel1[o1] + o2;
4413 if (mlevel2[i2] == 0xFF)
4414 mlevel2[i2] = count3++;
4415 o3 = decode[i] & 0x7F;
4416 i3 = 128*mlevel2[i2] + o3;
4417 mlevel3[i3] = i;
4418 }
4419 return result;
4420}
4421
4422static int
4423encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4424{
4425 struct encoding_map *map = (struct encoding_map*)mapping;
4426 int l1 = c>>11;
4427 int l2 = (c>>7) & 0xF;
4428 int l3 = c & 0x7F;
4429 int i;
4430
4431#ifdef Py_UNICODE_WIDE
4432 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004433 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004434 }
4435#endif
4436 if (c == 0)
4437 return 0;
4438 /* level 1*/
4439 i = map->level1[l1];
4440 if (i == 0xFF) {
4441 return -1;
4442 }
4443 /* level 2*/
4444 i = map->level23[16*i+l2];
4445 if (i == 0xFF) {
4446 return -1;
4447 }
4448 /* level 3 */
4449 i = map->level23[16*map->count2 + 128*i + l3];
4450 if (i == 0) {
4451 return -1;
4452 }
4453 return i;
4454}
4455
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004456/* Lookup the character ch in the mapping. If the character
4457 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004458 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 PyObject *w = PyInt_FromLong((long)c);
4462 PyObject *x;
4463
4464 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004465 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004466 x = PyObject_GetItem(mapping, w);
4467 Py_DECREF(w);
4468 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004469 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4470 /* No mapping found means: mapping is undefined. */
4471 PyErr_Clear();
4472 x = Py_None;
4473 Py_INCREF(x);
4474 return x;
4475 } else
4476 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004478 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004479 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004480 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004481 long value = PyInt_AS_LONG(x);
4482 if (value < 0 || value > 255) {
4483 PyErr_SetString(PyExc_TypeError,
4484 "character mapping must be in range(256)");
4485 Py_DECREF(x);
4486 return NULL;
4487 }
4488 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004490 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004491 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004493 /* wrong return value */
4494 PyErr_SetString(PyExc_TypeError,
4495 "character mapping must return integer, None or str");
4496 Py_DECREF(x);
4497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 }
4499}
4500
Martin v. Löwis3f767792006-06-04 19:36:28 +00004501static int
4502charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4503{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004504 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4505 /* exponentially overallocate to minimize reallocations */
4506 if (requiredsize < 2*outsize)
4507 requiredsize = 2*outsize;
4508 if (_PyString_Resize(outobj, requiredsize)) {
4509 return 0;
4510 }
4511 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004512}
4513
/* Result codes of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* lookup or reallocation failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004517/* lookup the character, put the result in the output string and adjust
4518 various state variables. Reallocate the output string if not enough
4519 space is available. Return a new reference to the object that
4520 was put in the output buffer, or Py_None, if the mapping was undefined
4521 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004522 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004524charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004525 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004527 PyObject *rep;
4528 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004529 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530
Christian Heimese93237d2007-12-19 02:37:44 +00004531 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004532 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004533 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004534 if (res == -1)
4535 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004536 if (outsize<requiredsize)
4537 if (!charmapencode_resize(outobj, outpos, requiredsize))
4538 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004539 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004540 outstart[(*outpos)++] = (char)res;
4541 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004542 }
4543
4544 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004546 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004547 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004548 Py_DECREF(rep);
4549 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004550 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004551 if (PyInt_Check(rep)) {
4552 Py_ssize_t requiredsize = *outpos+1;
4553 if (outsize<requiredsize)
4554 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4555 Py_DECREF(rep);
4556 return enc_EXCEPTION;
4557 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004558 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004559 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004560 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004561 else {
4562 const char *repchars = PyString_AS_STRING(rep);
4563 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4564 Py_ssize_t requiredsize = *outpos+repsize;
4565 if (outsize<requiredsize)
4566 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4567 Py_DECREF(rep);
4568 return enc_EXCEPTION;
4569 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004570 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004571 memcpy(outstart + *outpos, repchars, repsize);
4572 *outpos += repsize;
4573 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 }
Georg Brandl9f167602006-06-04 21:46:16 +00004575 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004576 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577}
4578
4579/* handle an error in PyUnicode_EncodeCharmap
4580 Return 0 on success, -1 on error */
4581static
4582int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004583 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004585 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004586 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587{
4588 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004589 Py_ssize_t repsize;
4590 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 Py_UNICODE *uni2;
4592 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004593 Py_ssize_t collstartpos = *inpos;
4594 Py_ssize_t collendpos = *inpos+1;
4595 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596 char *encoding = "charmap";
4597 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004598 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600 /* find all unencodable characters */
4601 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004602 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004603 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004604 int res = encoding_map_lookup(p[collendpos], mapping);
4605 if (res != -1)
4606 break;
4607 ++collendpos;
4608 continue;
4609 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004610
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004611 rep = charmapencode_lookup(p[collendpos], mapping);
4612 if (rep==NULL)
4613 return -1;
4614 else if (rep!=Py_None) {
4615 Py_DECREF(rep);
4616 break;
4617 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004618 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004619 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004620 }
4621 /* cache callback name lookup
4622 * (if not done yet, i.e. it's the first error) */
4623 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004624 if ((errors==NULL) || (!strcmp(errors, "strict")))
4625 *known_errorHandler = 1;
4626 else if (!strcmp(errors, "replace"))
4627 *known_errorHandler = 2;
4628 else if (!strcmp(errors, "ignore"))
4629 *known_errorHandler = 3;
4630 else if (!strcmp(errors, "xmlcharrefreplace"))
4631 *known_errorHandler = 4;
4632 else
4633 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634 }
4635 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004636 case 1: /* strict */
4637 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4638 return -1;
4639 case 2: /* replace */
4640 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004641 x = charmapencode_output('?', mapping, res, respos);
4642 if (x==enc_EXCEPTION) {
4643 return -1;
4644 }
4645 else if (x==enc_FAILED) {
4646 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4647 return -1;
4648 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004649 }
4650 /* fall through */
4651 case 3: /* ignore */
4652 *inpos = collendpos;
4653 break;
4654 case 4: /* xmlcharrefreplace */
4655 /* generate replacement (temporarily (mis)uses p) */
4656 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004657 char buffer[2+29+1+1];
4658 char *cp;
4659 sprintf(buffer, "&#%d;", (int)p[collpos]);
4660 for (cp = buffer; *cp; ++cp) {
4661 x = charmapencode_output(*cp, mapping, res, respos);
4662 if (x==enc_EXCEPTION)
4663 return -1;
4664 else if (x==enc_FAILED) {
4665 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4666 return -1;
4667 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004668 }
4669 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004670 *inpos = collendpos;
4671 break;
4672 default:
4673 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004674 encoding, reason, p, size, exceptionObject,
4675 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004676 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004677 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004678 /* generate replacement */
4679 repsize = PyUnicode_GET_SIZE(repunicode);
4680 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004681 x = charmapencode_output(*uni2, mapping, res, respos);
4682 if (x==enc_EXCEPTION) {
4683 return -1;
4684 }
4685 else if (x==enc_FAILED) {
4686 Py_DECREF(repunicode);
4687 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4688 return -1;
4689 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004690 }
4691 *inpos = newpos;
4692 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004693 }
4694 return 0;
4695}
4696
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004698 Py_ssize_t size,
4699 PyObject *mapping,
4700 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004702 /* output object */
4703 PyObject *res = NULL;
4704 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004705 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004706 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004707 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004708 PyObject *errorHandler = NULL;
4709 PyObject *exc = NULL;
4710 /* the following variable is used for caching string comparisons
4711 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4712 * 3=ignore, 4=xmlcharrefreplace */
4713 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714
4715 /* Default to Latin-1 */
4716 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004717 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 /* allocate enough for a simple encoding without
4720 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004721 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004722 if (res == NULL)
4723 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004724 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004725 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004728 /* try to encode it */
4729 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4730 if (x==enc_EXCEPTION) /* error */
4731 goto onError;
4732 if (x==enc_FAILED) { /* unencodable character */
4733 if (charmap_encoding_error(p, size, &inpos, mapping,
4734 &exc,
4735 &known_errorHandler, &errorHandler, errors,
4736 &res, &respos)) {
4737 goto onError;
4738 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004739 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004740 else
4741 /* done with this character => adjust input position */
4742 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004746 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004747 if (_PyString_Resize(&res, respos))
4748 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004749 }
4750 Py_XDECREF(exc);
4751 Py_XDECREF(errorHandler);
4752 return res;
4753
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004754 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004755 Py_XDECREF(res);
4756 Py_XDECREF(exc);
4757 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 return NULL;
4759}
4760
4761PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004762 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763{
4764 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004765 PyErr_BadArgument();
4766 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 }
4768 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004769 PyUnicode_GET_SIZE(unicode),
4770 mapping,
4771 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772}
4773
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774/* create or adjust a UnicodeTranslateError */
4775static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004776 const Py_UNICODE *unicode, Py_ssize_t size,
4777 Py_ssize_t startpos, Py_ssize_t endpos,
4778 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004781 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004782 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 }
4784 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004785 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4786 goto onError;
4787 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4788 goto onError;
4789 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4790 goto onError;
4791 return;
4792 onError:
4793 Py_DECREF(*exceptionObject);
4794 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795 }
4796}
4797
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004798/* raises a UnicodeTranslateError */
4799static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004800 const Py_UNICODE *unicode, Py_ssize_t size,
4801 Py_ssize_t startpos, Py_ssize_t endpos,
4802 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004803{
4804 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004805 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004806 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004807 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808}
4809
4810/* error handling callback helper:
4811 build arguments, call the callback and check the arguments,
4812 put the result into newpos and return the replacement string, which
4813 has to be freed by the caller */
4814static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004815 PyObject **errorHandler,
4816 const char *reason,
4817 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4818 Py_ssize_t startpos, Py_ssize_t endpos,
4819 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004821 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004822
Martin v. Löwis412fb672006-04-13 06:34:32 +00004823 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 PyObject *restuple;
4825 PyObject *resunicode;
4826
4827 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004828 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004829 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004830 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004831 }
4832
4833 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004834 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004835 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004836 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004837
4838 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004839 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004840 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004841 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004843 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004844 Py_DECREF(restuple);
4845 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 }
4847 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004848 &resunicode, &i_newpos)) {
4849 Py_DECREF(restuple);
4850 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004852 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004853 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004854 else
4855 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004856 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004857 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4858 Py_DECREF(restuple);
4859 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004860 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 Py_INCREF(resunicode);
4862 Py_DECREF(restuple);
4863 return resunicode;
4864}
4865
4866/* Lookup the character ch in the mapping and put the result in result,
4867 which must be decrefed by the caller.
4868 Return 0 on success, -1 on error */
4869static
4870int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4871{
4872 PyObject *w = PyInt_FromLong((long)c);
4873 PyObject *x;
4874
4875 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004876 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877 x = PyObject_GetItem(mapping, w);
4878 Py_DECREF(w);
4879 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004880 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4881 /* No mapping found means: use 1:1 mapping. */
4882 PyErr_Clear();
4883 *result = NULL;
4884 return 0;
4885 } else
4886 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004887 }
4888 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004889 *result = x;
4890 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004891 }
4892 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004893 long value = PyInt_AS_LONG(x);
4894 long max = PyUnicode_GetMax();
4895 if (value < 0 || value > max) {
4896 PyErr_Format(PyExc_TypeError,
4897 "character mapping must be in range(0x%lx)", max+1);
4898 Py_DECREF(x);
4899 return -1;
4900 }
4901 *result = x;
4902 return 0;
4903 }
4904 else if (PyUnicode_Check(x)) {
4905 *result = x;
4906 return 0;
4907 }
4908 else {
4909 /* wrong return value */
4910 PyErr_SetString(PyExc_TypeError,
4911 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004912 Py_DECREF(x);
4913 return -1;
4914 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915}
4916/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004917 if not reallocate and adjust various state variables.
4918 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004919static
Walter Dörwald4894c302003-10-24 14:25:28 +00004920int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004921 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004922{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004923 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004924 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004925 /* remember old output position */
4926 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4927 /* exponentially overallocate to minimize reallocations */
4928 if (requiredsize < 2 * oldsize)
4929 requiredsize = 2 * oldsize;
4930 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4931 return -1;
4932 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004933 }
4934 return 0;
4935}
4936/* lookup the character, put the result in the output string and adjust
4937 various state variables. Return a new reference to the object that
4938 was put in the output buffer in *result, or Py_None, if the mapping was
4939 undefined (in which case no character was written).
4940 The called must decref result.
4941 Return 0 on success, -1 on error. */
4942static
Walter Dörwald4894c302003-10-24 14:25:28 +00004943int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004944 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4945 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004946{
Walter Dörwald4894c302003-10-24 14:25:28 +00004947 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004948 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004949 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004950 /* not found => default to 1:1 mapping */
4951 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004952 }
4953 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004954 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004955 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004956 /* no overflow check, because we know that the space is enough */
4957 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004958 }
4959 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004960 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4961 if (repsize==1) {
4962 /* no overflow check, because we know that the space is enough */
4963 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4964 }
4965 else if (repsize!=0) {
4966 /* more than one character */
4967 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4968 (insize - (curinp-startinp)) +
4969 repsize - 1;
4970 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4971 return -1;
4972 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4973 *outp += repsize;
4974 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004975 }
4976 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004977 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004978 return 0;
4979}
4980
4981PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004982 Py_ssize_t size,
4983 PyObject *mapping,
4984 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004985{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004986 /* output object */
4987 PyObject *res = NULL;
4988 /* pointers to the beginning and end+1 of input */
4989 const Py_UNICODE *startp = p;
4990 const Py_UNICODE *endp = p + size;
4991 /* pointer into the output */
4992 Py_UNICODE *str;
4993 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004994 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004995 char *reason = "character maps to <undefined>";
4996 PyObject *errorHandler = NULL;
4997 PyObject *exc = NULL;
4998 /* the following variable is used for caching string comparisons
4999 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5000 * 3=ignore, 4=xmlcharrefreplace */
5001 int known_errorHandler = -1;
5002
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005004 PyErr_BadArgument();
5005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005007
5008 /* allocate enough for a simple 1:1 translation without
5009 replacements, if we need more, we'll resize */
5010 res = PyUnicode_FromUnicode(NULL, size);
5011 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005012 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005014 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005015 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005017 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005018 /* try to encode it */
5019 PyObject *x = NULL;
5020 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5021 Py_XDECREF(x);
5022 goto onError;
5023 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005024 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005025 if (x!=Py_None) /* it worked => adjust input pointer */
5026 ++p;
5027 else { /* untranslatable character */
5028 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5029 Py_ssize_t repsize;
5030 Py_ssize_t newpos;
5031 Py_UNICODE *uni2;
5032 /* startpos for collecting untranslatable chars */
5033 const Py_UNICODE *collstart = p;
5034 const Py_UNICODE *collend = p+1;
5035 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005037 /* find all untranslatable characters */
5038 while (collend < endp) {
5039 if (charmaptranslate_lookup(*collend, mapping, &x))
5040 goto onError;
5041 Py_XDECREF(x);
5042 if (x!=Py_None)
5043 break;
5044 ++collend;
5045 }
5046 /* cache callback name lookup
5047 * (if not done yet, i.e. it's the first error) */
5048 if (known_errorHandler==-1) {
5049 if ((errors==NULL) || (!strcmp(errors, "strict")))
5050 known_errorHandler = 1;
5051 else if (!strcmp(errors, "replace"))
5052 known_errorHandler = 2;
5053 else if (!strcmp(errors, "ignore"))
5054 known_errorHandler = 3;
5055 else if (!strcmp(errors, "xmlcharrefreplace"))
5056 known_errorHandler = 4;
5057 else
5058 known_errorHandler = 0;
5059 }
5060 switch (known_errorHandler) {
5061 case 1: /* strict */
5062 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005063 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005064 case 2: /* replace */
5065 /* No need to check for space, this is a 1:1 replacement */
5066 for (coll = collstart; coll<collend; ++coll)
5067 *str++ = '?';
5068 /* fall through */
5069 case 3: /* ignore */
5070 p = collend;
5071 break;
5072 case 4: /* xmlcharrefreplace */
5073 /* generate replacement (temporarily (mis)uses p) */
5074 for (p = collstart; p < collend; ++p) {
5075 char buffer[2+29+1+1];
5076 char *cp;
5077 sprintf(buffer, "&#%d;", (int)*p);
5078 if (charmaptranslate_makespace(&res, &str,
5079 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5080 goto onError;
5081 for (cp = buffer; *cp; ++cp)
5082 *str++ = *cp;
5083 }
5084 p = collend;
5085 break;
5086 default:
5087 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5088 reason, startp, size, &exc,
5089 collstart-startp, collend-startp, &newpos);
5090 if (repunicode == NULL)
5091 goto onError;
5092 /* generate replacement */
5093 repsize = PyUnicode_GET_SIZE(repunicode);
5094 if (charmaptranslate_makespace(&res, &str,
5095 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5096 Py_DECREF(repunicode);
5097 goto onError;
5098 }
5099 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5100 *str++ = *uni2;
5101 p = startp + newpos;
5102 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005103 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005104 }
5105 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005106 /* Resize if we allocated to much */
5107 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005108 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005109 if (PyUnicode_Resize(&res, respos) < 0)
5110 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005111 }
5112 Py_XDECREF(exc);
5113 Py_XDECREF(errorHandler);
5114 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005116 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005117 Py_XDECREF(res);
5118 Py_XDECREF(exc);
5119 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 return NULL;
5121}
5122
5123PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005124 PyObject *mapping,
5125 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126{
5127 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005128
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129 str = PyUnicode_FromObject(str);
5130 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005131 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005133 PyUnicode_GET_SIZE(str),
5134 mapping,
5135 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 Py_DECREF(str);
5137 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005138
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005139 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140 Py_XDECREF(str);
5141 return NULL;
5142}
Tim Petersced69f82003-09-16 20:30:58 +00005143
Guido van Rossum9e896b32000-04-05 20:11:21 +00005144/* --- Decimal Encoder ---------------------------------------------------- */
5145
5146int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005147 Py_ssize_t length,
5148 char *output,
5149 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005150{
5151 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005152 PyObject *errorHandler = NULL;
5153 PyObject *exc = NULL;
5154 const char *encoding = "decimal";
5155 const char *reason = "invalid decimal Unicode string";
5156 /* the following variable is used for caching string comparisons
5157 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5158 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005159
5160 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005161 PyErr_BadArgument();
5162 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005163 }
5164
5165 p = s;
5166 end = s + length;
5167 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005168 register Py_UNICODE ch = *p;
5169 int decimal;
5170 PyObject *repunicode;
5171 Py_ssize_t repsize;
5172 Py_ssize_t newpos;
5173 Py_UNICODE *uni2;
5174 Py_UNICODE *collstart;
5175 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005176
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005177 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005178 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005179 ++p;
5180 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005181 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005182 decimal = Py_UNICODE_TODECIMAL(ch);
5183 if (decimal >= 0) {
5184 *output++ = '0' + decimal;
5185 ++p;
5186 continue;
5187 }
5188 if (0 < ch && ch < 256) {
5189 *output++ = (char)ch;
5190 ++p;
5191 continue;
5192 }
5193 /* All other characters are considered unencodable */
5194 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005195 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005196 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005197 Py_UNICODE_ISSPACE(*collend) ||
5198 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005199 break;
5200 }
5201 /* cache callback name lookup
5202 * (if not done yet, i.e. it's the first error) */
5203 if (known_errorHandler==-1) {
5204 if ((errors==NULL) || (!strcmp(errors, "strict")))
5205 known_errorHandler = 1;
5206 else if (!strcmp(errors, "replace"))
5207 known_errorHandler = 2;
5208 else if (!strcmp(errors, "ignore"))
5209 known_errorHandler = 3;
5210 else if (!strcmp(errors, "xmlcharrefreplace"))
5211 known_errorHandler = 4;
5212 else
5213 known_errorHandler = 0;
5214 }
5215 switch (known_errorHandler) {
5216 case 1: /* strict */
5217 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5218 goto onError;
5219 case 2: /* replace */
5220 for (p = collstart; p < collend; ++p)
5221 *output++ = '?';
5222 /* fall through */
5223 case 3: /* ignore */
5224 p = collend;
5225 break;
5226 case 4: /* xmlcharrefreplace */
5227 /* generate replacement (temporarily (mis)uses p) */
5228 for (p = collstart; p < collend; ++p)
5229 output += sprintf(output, "&#%d;", (int)*p);
5230 p = collend;
5231 break;
5232 default:
5233 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5234 encoding, reason, s, length, &exc,
5235 collstart-s, collend-s, &newpos);
5236 if (repunicode == NULL)
5237 goto onError;
5238 /* generate replacement */
5239 repsize = PyUnicode_GET_SIZE(repunicode);
5240 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5241 Py_UNICODE ch = *uni2;
5242 if (Py_UNICODE_ISSPACE(ch))
5243 *output++ = ' ';
5244 else {
5245 decimal = Py_UNICODE_TODECIMAL(ch);
5246 if (decimal >= 0)
5247 *output++ = '0' + decimal;
5248 else if (0 < ch && ch < 256)
5249 *output++ = (char)ch;
5250 else {
5251 Py_DECREF(repunicode);
5252 raise_encode_exception(&exc, encoding,
5253 s, length, collstart-s, collend-s, reason);
5254 goto onError;
5255 }
5256 }
5257 }
5258 p = s + newpos;
5259 Py_DECREF(repunicode);
5260 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005261 }
5262 /* 0-terminate the output string */
5263 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005264 Py_XDECREF(exc);
5265 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005266 return 0;
5267
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005268 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005269 Py_XDECREF(exc);
5270 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005271 return -1;
5272}
5273
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274/* --- Helpers ------------------------------------------------------------ */
5275
Eric Smitha9f7d622008-02-17 19:46:49 +00005276#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005277#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005278
5279#include "stringlib/count.h"
5280#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005281#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005282#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005283
/* helper macro to fixup start/end slice values
 *
 * Clamps `end` to `len`; a negative `end` or `start` has `len` added to it
 * and is then clamped at 0.  `start` is not clamped against `len`.
 * `start` and `end` must be modifiable lvalues, and every argument may be
 * evaluated more than once.  The body is wrapped in do { } while (0) so
 * the macro behaves as a single statement (safe in an unbraced if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005298
Martin v. Löwis18e16552006-02-15 17:27:45 +00005299Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005300 PyObject *substr,
5301 Py_ssize_t start,
5302 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005304 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005305 PyUnicodeObject* str_obj;
5306 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005307
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005308 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5309 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005310 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005311 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5312 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005313 Py_DECREF(str_obj);
5314 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 }
Tim Petersced69f82003-09-16 20:30:58 +00005316
Antoine Pitrou64672132010-01-13 07:55:48 +00005317 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005318 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005319 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5320 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005321 );
5322
5323 Py_DECREF(sub_obj);
5324 Py_DECREF(str_obj);
5325
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 return result;
5327}
5328
Martin v. Löwis18e16552006-02-15 17:27:45 +00005329Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005330 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005331 Py_ssize_t start,
5332 Py_ssize_t end,
5333 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005335 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005336
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005337 str = PyUnicode_FromObject(str);
5338 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005339 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005340 sub = PyUnicode_FromObject(sub);
5341 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005342 Py_DECREF(str);
5343 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344 }
Tim Petersced69f82003-09-16 20:30:58 +00005345
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005346 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005347 result = stringlib_find_slice(
5348 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5349 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5350 start, end
5351 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005352 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005353 result = stringlib_rfind_slice(
5354 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5355 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5356 start, end
5357 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005358
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005359 Py_DECREF(str);
5360 Py_DECREF(sub);
5361
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 return result;
5363}
5364
Tim Petersced69f82003-09-16 20:30:58 +00005365static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005367 PyUnicodeObject *substring,
5368 Py_ssize_t start,
5369 Py_ssize_t end,
5370 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 if (substring->length == 0)
5373 return 1;
5374
Antoine Pitrou64672132010-01-13 07:55:48 +00005375 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 end -= substring->length;
5377 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005378 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379
5380 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005381 if (Py_UNICODE_MATCH(self, end, substring))
5382 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 } else {
5384 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005385 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 }
5387
5388 return 0;
5389}
5390
Martin v. Löwis18e16552006-02-15 17:27:45 +00005391Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005392 PyObject *substr,
5393 Py_ssize_t start,
5394 Py_ssize_t end,
5395 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005397 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005398
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 str = PyUnicode_FromObject(str);
5400 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005401 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 substr = PyUnicode_FromObject(substr);
5403 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005404 Py_DECREF(str);
5405 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 }
Tim Petersced69f82003-09-16 20:30:58 +00005407
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005409 (PyUnicodeObject *)substr,
5410 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 Py_DECREF(str);
5412 Py_DECREF(substr);
5413 return result;
5414}
5415
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416/* Apply fixfct filter to the Unicode object self and return a
5417 reference to the modified object */
5418
Tim Petersced69f82003-09-16 20:30:58 +00005419static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005421 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422{
5423
5424 PyUnicodeObject *u;
5425
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005426 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005428 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005429
5430 Py_UNICODE_COPY(u->str, self->str, self->length);
5431
Tim Peters7a29bd52001-09-12 03:03:31 +00005432 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005433 /* fixfct should return TRUE if it modified the buffer. If
5434 FALSE, return a reference to the original buffer instead
5435 (to save space, not time) */
5436 Py_INCREF(self);
5437 Py_DECREF(u);
5438 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 }
5440 return (PyObject*) u;
5441}
5442
Tim Petersced69f82003-09-16 20:30:58 +00005443static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444int fixupper(PyUnicodeObject *self)
5445{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005446 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 Py_UNICODE *s = self->str;
5448 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005451 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005452
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005453 ch = Py_UNICODE_TOUPPER(*s);
5454 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005456 *s = ch;
5457 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 s++;
5459 }
5460
5461 return status;
5462}
5463
Tim Petersced69f82003-09-16 20:30:58 +00005464static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465int fixlower(PyUnicodeObject *self)
5466{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005467 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 Py_UNICODE *s = self->str;
5469 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005470
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005472 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005473
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005474 ch = Py_UNICODE_TOLOWER(*s);
5475 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005477 *s = ch;
5478 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 s++;
5480 }
5481
5482 return status;
5483}
5484
Tim Petersced69f82003-09-16 20:30:58 +00005485static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486int fixswapcase(PyUnicodeObject *self)
5487{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005488 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 Py_UNICODE *s = self->str;
5490 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005491
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 while (len-- > 0) {
5493 if (Py_UNICODE_ISUPPER(*s)) {
5494 *s = Py_UNICODE_TOLOWER(*s);
5495 status = 1;
5496 } else if (Py_UNICODE_ISLOWER(*s)) {
5497 *s = Py_UNICODE_TOUPPER(*s);
5498 status = 1;
5499 }
5500 s++;
5501 }
5502
5503 return status;
5504}
5505
Tim Petersced69f82003-09-16 20:30:58 +00005506static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507int fixcapitalize(PyUnicodeObject *self)
5508{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005509 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005510 Py_UNICODE *s = self->str;
5511 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005512
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005513 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005514 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005515 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005516 *s = Py_UNICODE_TOUPPER(*s);
5517 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005519 s++;
5520 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005521 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005522 *s = Py_UNICODE_TOLOWER(*s);
5523 status = 1;
5524 }
5525 s++;
5526 }
5527 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528}
5529
5530static
5531int fixtitle(PyUnicodeObject *self)
5532{
5533 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5534 register Py_UNICODE *e;
5535 int previous_is_cased;
5536
5537 /* Shortcut for single character strings */
5538 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005539 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5540 if (*p != ch) {
5541 *p = ch;
5542 return 1;
5543 }
5544 else
5545 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546 }
Tim Petersced69f82003-09-16 20:30:58 +00005547
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 e = p + PyUnicode_GET_SIZE(self);
5549 previous_is_cased = 0;
5550 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005551 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005552
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005553 if (previous_is_cased)
5554 *p = Py_UNICODE_TOLOWER(ch);
5555 else
5556 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005557
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005558 if (Py_UNICODE_ISLOWER(ch) ||
5559 Py_UNICODE_ISUPPER(ch) ||
5560 Py_UNICODE_ISTITLE(ch))
5561 previous_is_cased = 1;
5562 else
5563 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564 }
5565 return 1;
5566}
5567
Tim Peters8ce9f162004-08-27 01:49:32 +00005568PyObject *
5569PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570{
Tim Peters8ce9f162004-08-27 01:49:32 +00005571 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005572 const Py_UNICODE blank = ' ';
5573 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005574 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005575 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005576 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5577 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005578 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5579 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005580 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005581 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005582 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583
Tim Peters05eba1f2004-08-27 21:32:02 +00005584 fseq = PySequence_Fast(seq, "");
5585 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005586 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005587 }
5588
Tim Peters91879ab2004-08-27 22:35:44 +00005589 /* Grrrr. A codec may be invoked to convert str objects to
5590 * Unicode, and so it's possible to call back into Python code
5591 * during PyUnicode_FromObject(), and so it's possible for a sick
5592 * codec to change the size of fseq (if seq is a list). Therefore
5593 * we have to keep refetching the size -- can't assume seqlen
5594 * is invariant.
5595 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005596 seqlen = PySequence_Fast_GET_SIZE(fseq);
5597 /* If empty sequence, return u"". */
5598 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005599 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5600 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005601 }
5602 /* If singleton sequence with an exact Unicode, return that. */
5603 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005604 item = PySequence_Fast_GET_ITEM(fseq, 0);
5605 if (PyUnicode_CheckExact(item)) {
5606 Py_INCREF(item);
5607 res = (PyUnicodeObject *)item;
5608 goto Done;
5609 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005610 }
5611
Tim Peters05eba1f2004-08-27 21:32:02 +00005612 /* At least two items to join, or one that isn't exact Unicode. */
5613 if (seqlen > 1) {
5614 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005615 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005616 sep = &blank;
5617 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005618 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005619 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005620 internal_separator = PyUnicode_FromObject(separator);
5621 if (internal_separator == NULL)
5622 goto onError;
5623 sep = PyUnicode_AS_UNICODE(internal_separator);
5624 seplen = PyUnicode_GET_SIZE(internal_separator);
5625 /* In case PyUnicode_FromObject() mutated seq. */
5626 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005627 }
5628 }
5629
5630 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005631 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005632 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005633 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005634 res_p = PyUnicode_AS_UNICODE(res);
5635 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005636
Tim Peters05eba1f2004-08-27 21:32:02 +00005637 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005638 Py_ssize_t itemlen;
5639 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005640
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005641 item = PySequence_Fast_GET_ITEM(fseq, i);
5642 /* Convert item to Unicode. */
5643 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5644 PyErr_Format(PyExc_TypeError,
5645 "sequence item %zd: expected string or Unicode,"
5646 " %.80s found",
5647 i, Py_TYPE(item)->tp_name);
5648 goto onError;
5649 }
5650 item = PyUnicode_FromObject(item);
5651 if (item == NULL)
5652 goto onError;
5653 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005654
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005655 /* In case PyUnicode_FromObject() mutated seq. */
5656 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005657
Tim Peters8ce9f162004-08-27 01:49:32 +00005658 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005659 itemlen = PyUnicode_GET_SIZE(item);
5660 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005661 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005662 goto Overflow;
5663 if (i < seqlen - 1) {
5664 new_res_used += seplen;
5665 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005666 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005667 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005668 if (new_res_used > res_alloc) {
5669 /* double allocated size until it's big enough */
5670 do {
5671 res_alloc += res_alloc;
5672 if (res_alloc <= 0)
5673 goto Overflow;
5674 } while (new_res_used > res_alloc);
5675 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5676 Py_DECREF(item);
5677 goto onError;
5678 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005679 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005680 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005681
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005682 /* Copy item, and maybe the separator. */
5683 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5684 res_p += itemlen;
5685 if (i < seqlen - 1) {
5686 Py_UNICODE_COPY(res_p, sep, seplen);
5687 res_p += seplen;
5688 }
5689 Py_DECREF(item);
5690 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005691 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005692
Tim Peters05eba1f2004-08-27 21:32:02 +00005693 /* Shrink res to match the used area; this probably can't fail,
5694 * but it's cheap to check.
5695 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005696 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005697 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005698
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005699 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005700 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005701 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 return (PyObject *)res;
5703
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005704 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005705 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005706 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005707 Py_DECREF(item);
5708 /* fall through */
5709
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005710 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005711 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005712 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005713 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 return NULL;
5715}
5716
Tim Petersced69f82003-09-16 20:30:58 +00005717static
5718PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005719 Py_ssize_t left,
5720 Py_ssize_t right,
5721 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722{
5723 PyUnicodeObject *u;
5724
5725 if (left < 0)
5726 left = 0;
5727 if (right < 0)
5728 right = 0;
5729
Tim Peters7a29bd52001-09-12 03:03:31 +00005730 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 Py_INCREF(self);
5732 return self;
5733 }
5734
Neal Norwitze7d8be82008-07-31 17:17:14 +00005735 if (left > PY_SSIZE_T_MAX - self->length ||
5736 right > PY_SSIZE_T_MAX - (left + self->length)) {
5737 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5738 return NULL;
5739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 u = _PyUnicode_New(left + self->length + right);
5741 if (u) {
5742 if (left)
5743 Py_UNICODE_FILL(u->str, fill, left);
5744 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5745 if (right)
5746 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5747 }
5748
5749 return u;
5750}
5751
Antoine Pitrou64672132010-01-13 07:55:48 +00005752PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
5756 string = PyUnicode_FromObject(string);
5757 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Antoine Pitrou64672132010-01-13 07:55:48 +00005760 list = stringlib_splitlines(
5761 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5762 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763
5764 Py_DECREF(string);
5765 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766}
5767
Tim Petersced69f82003-09-16 20:30:58 +00005768static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005770 PyUnicodeObject *substring,
5771 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005774 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005777 return stringlib_split_whitespace(
5778 (PyObject*) self, self->str, self->length, maxcount
5779 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005780
Antoine Pitrou64672132010-01-13 07:55:48 +00005781 return stringlib_split(
5782 (PyObject*) self, self->str, self->length,
5783 substring->str, substring->length,
5784 maxcount
5785 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786}
5787
Tim Petersced69f82003-09-16 20:30:58 +00005788static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005789PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005790 PyUnicodeObject *substring,
5791 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005792{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005793 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005794 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005795
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005796 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005797 return stringlib_rsplit_whitespace(
5798 (PyObject*) self, self->str, self->length, maxcount
5799 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005800
Antoine Pitrou64672132010-01-13 07:55:48 +00005801 return stringlib_rsplit(
5802 (PyObject*) self, self->str, self->length,
5803 substring->str, substring->length,
5804 maxcount
5805 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005806}
5807
5808static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005810 PyUnicodeObject *str1,
5811 PyUnicodeObject *str2,
5812 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813{
5814 PyUnicodeObject *u;
5815
5816 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005817 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005818 else if (maxcount == 0 || self->length == 0)
5819 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820
Fredrik Lundh347ee272006-05-24 16:35:18 +00005821 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005822 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005823 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005824 if (str1->length == 0)
5825 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005826 if (str1->length == 1) {
5827 /* replace characters */
5828 Py_UNICODE u1, u2;
5829 if (!findchar(self->str, self->length, str1->str[0]))
5830 goto nothing;
5831 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5832 if (!u)
5833 return NULL;
5834 Py_UNICODE_COPY(u->str, self->str, self->length);
5835 u1 = str1->str[0];
5836 u2 = str2->str[0];
5837 for (i = 0; i < u->length; i++)
5838 if (u->str[i] == u1) {
5839 if (--maxcount < 0)
5840 break;
5841 u->str[i] = u2;
5842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005844 i = stringlib_find(
5845 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005847 if (i < 0)
5848 goto nothing;
5849 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5850 if (!u)
5851 return NULL;
5852 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005853
5854 /* change everything in-place, starting with this one */
5855 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5856 i += str1->length;
5857
5858 while ( --maxcount > 0) {
5859 i = stringlib_find(self->str+i, self->length-i,
5860 str1->str, str1->length,
5861 i);
5862 if (i == -1)
5863 break;
5864 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5865 i += str1->length;
5866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005869
Brett Cannona7f13ee2010-05-04 01:16:51 +00005870 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005871 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 Py_UNICODE *p;
5873
5874 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005875 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5876 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005877 if (n == 0)
5878 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005879 /* new_size = self->length + n * (str2->length - str1->length)); */
5880 delta = (str2->length - str1->length);
5881 if (delta == 0) {
5882 new_size = self->length;
5883 } else {
5884 product = n * (str2->length - str1->length);
5885 if ((product / (str2->length - str1->length)) != n) {
5886 PyErr_SetString(PyExc_OverflowError,
5887 "replace string is too long");
5888 return NULL;
5889 }
5890 new_size = self->length + product;
5891 if (new_size < 0) {
5892 PyErr_SetString(PyExc_OverflowError,
5893 "replace string is too long");
5894 return NULL;
5895 }
5896 }
5897 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005898 if (!u)
5899 return NULL;
5900 i = 0;
5901 p = u->str;
5902 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005903 while (n-- > 0) {
5904 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005905 j = stringlib_find(self->str+i, self->length-i,
5906 str1->str, str1->length,
5907 i);
5908 if (j == -1)
5909 break;
5910 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005911 /* copy unchanged part [i:j] */
5912 Py_UNICODE_COPY(p, self->str+i, j-i);
5913 p += j - i;
5914 }
5915 /* copy substitution string */
5916 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005917 Py_UNICODE_COPY(p, str2->str, str2->length);
5918 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005919 }
5920 i = j + str1->length;
5921 }
5922 if (i < self->length)
5923 /* copy tail [i:] */
5924 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005925 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005926 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005927 while (n > 0) {
5928 Py_UNICODE_COPY(p, str2->str, str2->length);
5929 p += str2->length;
5930 if (--n <= 0)
5931 break;
5932 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005934 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 }
5936 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005938
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005939 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005940 /* nothing to replace; return original string (when possible) */
5941 if (PyUnicode_CheckExact(self)) {
5942 Py_INCREF(self);
5943 return (PyObject *) self;
5944 }
5945 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946}
5947
5948/* --- Unicode Object Methods --------------------------------------------- */
5949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005950PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005951 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952\n\
5953Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005954characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955
5956static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005957unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 return fixup(self, fixtitle);
5960}
5961
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005962PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005963 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964\n\
5965Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00005966have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
5968static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005969unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971 return fixup(self, fixcapitalize);
5972}
5973
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self into a list, runs every list entry through
   fixup(..., fixcapitalize) and joins the list again.  Returns NULL
   as soon as one of the steps fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized counterpart */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        /* drop the old entry before overwriting the slot */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Glue the words back together */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6011
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006012/* Argument converter. Coerces to a single unicode character */
6013
6014static int
6015convert_uc(PyObject *obj, void *addr)
6016{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006017 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6018 PyObject *uniobj;
6019 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006020
Benjamin Peterson857ce152009-01-31 16:29:18 +00006021 uniobj = PyUnicode_FromObject(obj);
6022 if (uniobj == NULL) {
6023 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006024 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006025 return 0;
6026 }
6027 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6028 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006029 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006030 Py_DECREF(uniobj);
6031 return 0;
6032 }
6033 unistr = PyUnicode_AS_UNICODE(uniobj);
6034 *fillcharloc = unistr[0];
6035 Py_DECREF(uniobj);
6036 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006037}
6038
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006039PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006040 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006042Return S centered in a Unicode string of length width. Padding is\n\
6043done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044
6045static PyObject *
6046unicode_center(PyUnicodeObject *self, PyObject *args)
6047{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006048 Py_ssize_t marg, left;
6049 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006050 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051
Thomas Woutersde017742006-02-16 19:34:37 +00006052 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 return NULL;
6054
Tim Peters7a29bd52001-09-12 03:03:31 +00006055 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 Py_INCREF(self);
6057 return (PyObject*) self;
6058 }
6059
6060 marg = width - self->length;
6061 left = marg / 2 + (marg & width & 1);
6062
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006063 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064}
6065
Marc-André Lemburge5034372000-08-08 08:04:29 +00006066#if 0
6067
6068/* This code should go into some future Unicode collation support
6069 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006070 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006071
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006072/* speedy UTF-16 code point order comparison */
6073/* gleaned from: */
6074/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6075
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006076static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006077{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006078 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006079 0, 0, 0, 0, 0, 0, 0, 0,
6080 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006081 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006082};
6083
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084static int
6085unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6086{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006087 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006088
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089 Py_UNICODE *s1 = str1->str;
6090 Py_UNICODE *s2 = str2->str;
6091
6092 len1 = str1->length;
6093 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006094
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006096 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006097
6098 c1 = *s1++;
6099 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006100
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006101 if (c1 > (1<<11) * 26)
6102 c1 += utf16Fixup[c1>>11];
6103 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006104 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006105 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006106
6107 if (c1 != c2)
6108 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006109
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006110 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 }
6112
6113 return (len1 < len2) ? -1 : (len1 != len2);
6114}
6115
Marc-André Lemburge5034372000-08-08 08:04:29 +00006116#else
6117
6118static int
6119unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6120{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006121 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006122
6123 Py_UNICODE *s1 = str1->str;
6124 Py_UNICODE *s2 = str2->str;
6125
6126 len1 = str1->length;
6127 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006128
Marc-André Lemburge5034372000-08-08 08:04:29 +00006129 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006130 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006131
Fredrik Lundh45714e92001-06-26 16:39:36 +00006132 c1 = *s1++;
6133 c2 = *s2++;
6134
6135 if (c1 != c2)
6136 return (c1 < c2) ? -1 : 1;
6137
Marc-André Lemburge5034372000-08-08 08:04:29 +00006138 len1--; len2--;
6139 }
6140
6141 return (len1 < len2) ? -1 : (len1 != len2);
6142}
6143
6144#endif
6145
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006147 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148{
6149 PyUnicodeObject *u = NULL, *v = NULL;
6150 int result;
6151
6152 /* Coerce the two arguments */
6153 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6154 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6157 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006158 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
Thomas Wouters7e474022000-07-16 12:04:32 +00006160 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006162 Py_DECREF(u);
6163 Py_DECREF(v);
6164 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 }
6166
6167 result = unicode_compare(u, v);
6168
6169 Py_DECREF(u);
6170 Py_DECREF(v);
6171 return result;
6172
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006173 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 Py_XDECREF(u);
6175 Py_XDECREF(v);
6176 return -1;
6177}
6178
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006179PyObject *PyUnicode_RichCompare(PyObject *left,
6180 PyObject *right,
6181 int op)
6182{
6183 int result;
6184
6185 result = PyUnicode_Compare(left, right);
6186 if (result == -1 && PyErr_Occurred())
6187 goto onError;
6188
6189 /* Convert the return value to a Boolean */
6190 switch (op) {
6191 case Py_EQ:
6192 result = (result == 0);
6193 break;
6194 case Py_NE:
6195 result = (result != 0);
6196 break;
6197 case Py_LE:
6198 result = (result <= 0);
6199 break;
6200 case Py_GE:
6201 result = (result >= 0);
6202 break;
6203 case Py_LT:
6204 result = (result == -1);
6205 break;
6206 case Py_GT:
6207 result = (result == 1);
6208 break;
6209 }
6210 return PyBool_FromLong(result);
6211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006212 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006213
6214 /* Standard case
6215
6216 Type errors mean that PyUnicode_FromObject() could not convert
6217 one of the arguments (usually the right hand side) to Unicode,
6218 ie. we can't handle the comparison request. However, it is
6219 possible that the other object knows a comparison method, which
6220 is why we return Py_NotImplemented to give the other object a
6221 chance.
6222
6223 */
6224 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6225 PyErr_Clear();
6226 Py_INCREF(Py_NotImplemented);
6227 return Py_NotImplemented;
6228 }
6229 if (op != Py_EQ && op != Py_NE)
6230 return NULL;
6231
6232 /* Equality comparison.
6233
6234 This is a special case: we silence any PyExc_UnicodeDecodeError
6235 and instead turn it into a PyErr_UnicodeWarning.
6236
6237 */
6238 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6239 return NULL;
6240 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006241 if (PyErr_Warn(PyExc_UnicodeWarning,
6242 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006243 "Unicode equal comparison "
6244 "failed to convert both arguments to Unicode - "
6245 "interpreting them as being unequal" :
6246 "Unicode unequal comparison "
6247 "failed to convert both arguments to Unicode - "
6248 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006249 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006250 return NULL;
6251 result = (op == Py_NE);
6252 return PyBool_FromLong(result);
6253}
6254
Guido van Rossum403d68b2000-03-13 15:55:09 +00006255int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006256 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006257{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006258 PyObject *str, *sub;
6259 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006260
6261 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006262 sub = PyUnicode_FromObject(element);
6263 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006264 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006265 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006266
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006267 str = PyUnicode_FromObject(container);
6268 if (!str) {
6269 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006270 return -1;
6271 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006272
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006273 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006274
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006275 Py_DECREF(str);
6276 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006277
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006278 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006279}
6280
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281/* Concat to string or Unicode object giving a new Unicode object. */
6282
6283PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006284 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285{
6286 PyUnicodeObject *u = NULL, *v = NULL, *w;
6287
6288 /* Coerce the two arguments */
6289 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6290 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006291 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6293 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006294 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295
6296 /* Shortcuts */
6297 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006298 Py_DECREF(v);
6299 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 }
6301 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006302 Py_DECREF(u);
6303 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 }
6305
6306 /* Concat the two Unicode strings */
6307 w = _PyUnicode_New(u->length + v->length);
6308 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006309 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 Py_UNICODE_COPY(w->str, u->str, u->length);
6311 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6312
6313 Py_DECREF(u);
6314 Py_DECREF(v);
6315 return (PyObject *)w;
6316
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006317 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 Py_XDECREF(u);
6319 Py_XDECREF(v);
6320 return NULL;
6321}
6322
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006323PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006324 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006326Return the number of non-overlapping occurrences of substring sub in\n\
6327Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006328interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006329
6330static PyObject *
6331unicode_count(PyUnicodeObject *self, PyObject *args)
6332{
6333 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006334 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006335 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 PyObject *result;
6337
Jesus Cea44e81682011-04-20 16:39:15 +02006338 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6339 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006340 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006341
Antoine Pitrou64672132010-01-13 07:55:48 +00006342 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006343 result = PyInt_FromSsize_t(
6344 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006345 substring->str, substring->length,
6346 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006347 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348
6349 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006350
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 return result;
6352}
6353
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006354PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006355 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006357Encodes S using the codec registered for encoding. encoding defaults\n\
6358to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006359handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6361'xmlcharrefreplace' as well as any other name registered with\n\
6362codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363
6364static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006365unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006366{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006367 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368 char *encoding = NULL;
6369 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006370 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006371
Benjamin Peterson332d7212009-09-18 21:14:55 +00006372 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6373 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006375 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006376 if (v == NULL)
6377 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006378 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006379 PyErr_Format(PyExc_TypeError,
6380 "encoder did not return a string/unicode object "
6381 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006382 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006383 Py_DECREF(v);
6384 return NULL;
6385 }
6386 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006387
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006388 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006389 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006390}
6391
6392PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006393 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006394\n\
6395Decodes S using the codec registered for encoding. encoding defaults\n\
6396to the default encoding. errors may be given to set a different error\n\
6397handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6398a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006399as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006400able to handle UnicodeDecodeErrors.");
6401
6402static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006403unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006404{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006405 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006406 char *encoding = NULL;
6407 char *errors = NULL;
6408 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006409
Benjamin Peterson332d7212009-09-18 21:14:55 +00006410 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6411 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006412 return NULL;
6413 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006414 if (v == NULL)
6415 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006416 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006417 PyErr_Format(PyExc_TypeError,
6418 "decoder did not return a string/unicode object "
6419 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006420 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006421 Py_DECREF(v);
6422 return NULL;
6423 }
6424 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006425
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006426 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428}
6429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006430PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006431 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432\n\
6433Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006434If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435
6436static PyObject*
6437unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6438{
6439 Py_UNICODE *e;
6440 Py_UNICODE *p;
6441 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006442 Py_UNICODE *qe;
6443 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 PyUnicodeObject *u;
6445 int tabsize = 8;
6446
6447 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449
Thomas Wouters7e474022000-07-16 12:04:32 +00006450 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006451 i = 0; /* chars up to and including most recent \n or \r */
6452 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6453 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006454 for (p = self->str; p < e; p++)
6455 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006456 if (tabsize > 0) {
6457 incr = tabsize - (j % tabsize); /* cannot overflow */
6458 if (j > PY_SSIZE_T_MAX - incr)
6459 goto overflow1;
6460 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006461 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006462 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006464 if (j > PY_SSIZE_T_MAX - 1)
6465 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 j++;
6467 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006468 if (i > PY_SSIZE_T_MAX - j)
6469 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006471 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472 }
6473 }
6474
Guido van Rossum5bdff602008-03-11 21:18:06 +00006475 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006476 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006477
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 /* Second pass: create output string and fill it */
6479 u = _PyUnicode_New(i + j);
6480 if (!u)
6481 return NULL;
6482
Guido van Rossum5bdff602008-03-11 21:18:06 +00006483 j = 0; /* same as in first pass */
6484 q = u->str; /* next output char */
6485 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486
6487 for (p = self->str; p < e; p++)
6488 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006489 if (tabsize > 0) {
6490 i = tabsize - (j % tabsize);
6491 j += i;
6492 while (i--) {
6493 if (q >= qe)
6494 goto overflow2;
6495 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006496 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006497 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006498 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006499 else {
6500 if (q >= qe)
6501 goto overflow2;
6502 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006503 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 if (*p == '\n' || *p == '\r')
6505 j = 0;
6506 }
6507
6508 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006509
6510 overflow2:
6511 Py_DECREF(u);
6512 overflow1:
6513 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515}
6516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006517PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006518 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519\n\
6520Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006521such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522arguments start and end are interpreted as in slice notation.\n\
6523\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006524Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525
6526static PyObject *
6527unicode_find(PyUnicodeObject *self, PyObject *args)
6528{
Jesus Cea44e81682011-04-20 16:39:15 +02006529 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006530 Py_ssize_t start;
6531 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006532 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533
Jesus Cea44e81682011-04-20 16:39:15 +02006534 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6535 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006538 result = stringlib_find_slice(
6539 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6540 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6541 start, end
6542 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543
6544 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006545
6546 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547}
6548
6549static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006550unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551{
6552 if (index < 0 || index >= self->length) {
6553 PyErr_SetString(PyExc_IndexError, "string index out of range");
6554 return NULL;
6555 }
6556
6557 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6558}
6559
6560static long
6561unicode_hash(PyUnicodeObject *self)
6562{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006563 /* Since Unicode objects compare equal to their ASCII string
6564 counterparts, they should use the individual character values
6565 as basis for their hash value. This is needed to assure that
6566 strings and Unicode objects behave in the same way as
6567 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568
Martin v. Löwis18e16552006-02-15 17:27:45 +00006569 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006570 register Py_UNICODE *p;
6571 register long x;
6572
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006573#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006574 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006575#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006577 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006578 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006579 /*
6580 We make the hash of the empty string be 0, rather than using
6581 (prefix ^ suffix), since this slightly obfuscates the hash secret
6582 */
6583 if (len == 0) {
6584 self->hash = 0;
6585 return 0;
6586 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006587 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006588 x = _Py_HashSecret.prefix;
6589 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006590 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006591 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006592 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006593 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006594 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006595 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006596 self->hash = x;
6597 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598}
6599
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006600PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006601 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006603Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604
6605static PyObject *
6606unicode_index(PyUnicodeObject *self, PyObject *args)
6607{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006608 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006609 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006610 Py_ssize_t start;
6611 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612
Jesus Cea44e81682011-04-20 16:39:15 +02006613 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6614 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006617 result = stringlib_find_slice(
6618 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6619 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6620 start, end
6621 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622
6623 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006624
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 if (result < 0) {
6626 PyErr_SetString(PyExc_ValueError, "substring not found");
6627 return NULL;
6628 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006629
Martin v. Löwis18e16552006-02-15 17:27:45 +00006630 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631}
6632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006633PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006634 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006636Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006637at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638
6639static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006640unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641{
6642 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6643 register const Py_UNICODE *e;
6644 int cased;
6645
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 /* Shortcut for single character strings */
6647 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006648 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006650 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006651 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006652 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006653
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654 e = p + PyUnicode_GET_SIZE(self);
6655 cased = 0;
6656 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006657 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006658
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006659 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6660 return PyBool_FromLong(0);
6661 else if (!cased && Py_UNICODE_ISLOWER(ch))
6662 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006664 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665}
6666
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006667PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006668 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006670Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006671at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672
6673static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006674unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675{
6676 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6677 register const Py_UNICODE *e;
6678 int cased;
6679
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 /* Shortcut for single character strings */
6681 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006682 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006684 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006685 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006686 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006687
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 e = p + PyUnicode_GET_SIZE(self);
6689 cased = 0;
6690 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006691 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006692
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006693 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6694 return PyBool_FromLong(0);
6695 else if (!cased && Py_UNICODE_ISUPPER(ch))
6696 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006698 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699}
6700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006701PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006702 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006703\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006704Return True if S is a titlecased string and there is at least one\n\
6705character in S, i.e. upper- and titlecase characters may only\n\
6706follow uncased characters and lowercase characters only cased ones.\n\
6707Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708
6709static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006710unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006711{
6712 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6713 register const Py_UNICODE *e;
6714 int cased, previous_is_cased;
6715
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 /* Shortcut for single character strings */
6717 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006718 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6719 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006721 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006722 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006723 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006724
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 e = p + PyUnicode_GET_SIZE(self);
6726 cased = 0;
6727 previous_is_cased = 0;
6728 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006729 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006730
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006731 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6732 if (previous_is_cased)
6733 return PyBool_FromLong(0);
6734 previous_is_cased = 1;
6735 cased = 1;
6736 }
6737 else if (Py_UNICODE_ISLOWER(ch)) {
6738 if (!previous_is_cased)
6739 return PyBool_FromLong(0);
6740 previous_is_cased = 1;
6741 cased = 1;
6742 }
6743 else
6744 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006746 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747}
6748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006749PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006750 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006752Return True if all characters in S are whitespace\n\
6753and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754
6755static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006756unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757{
6758 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6759 register const Py_UNICODE *e;
6760
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 /* Shortcut for single character strings */
6762 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006763 Py_UNICODE_ISSPACE(*p))
6764 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006766 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006767 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006768 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006769
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 e = p + PyUnicode_GET_SIZE(self);
6771 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006772 if (!Py_UNICODE_ISSPACE(*p))
6773 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006775 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776}
6777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006778PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006779 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006780\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006781Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006782and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006783
6784static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006785unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006786{
6787 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6788 register const Py_UNICODE *e;
6789
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006790 /* Shortcut for single character strings */
6791 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006792 Py_UNICODE_ISALPHA(*p))
6793 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006794
6795 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006796 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006797 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006798
6799 e = p + PyUnicode_GET_SIZE(self);
6800 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006801 if (!Py_UNICODE_ISALPHA(*p))
6802 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006803 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006804 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006805}
6806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006807PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006808 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006809\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006810Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006811and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006812
6813static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006814unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006815{
6816 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6817 register const Py_UNICODE *e;
6818
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006819 /* Shortcut for single character strings */
6820 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006821 Py_UNICODE_ISALNUM(*p))
6822 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006823
6824 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006825 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006826 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006827
6828 e = p + PyUnicode_GET_SIZE(self);
6829 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006830 if (!Py_UNICODE_ISALNUM(*p))
6831 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006832 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006833 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006834}
6835
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006836PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006837 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006839Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006840False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841
6842static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006843unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844{
6845 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6846 register const Py_UNICODE *e;
6847
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 /* Shortcut for single character strings */
6849 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006850 Py_UNICODE_ISDECIMAL(*p))
6851 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006853 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006854 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006855 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006856
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 e = p + PyUnicode_GET_SIZE(self);
6858 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006859 if (!Py_UNICODE_ISDECIMAL(*p))
6860 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006862 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863}
6864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006865PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006866 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006868Return True if all characters in S are digits\n\
6869and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870
6871static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006872unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873{
6874 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6875 register const Py_UNICODE *e;
6876
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 /* Shortcut for single character strings */
6878 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006879 Py_UNICODE_ISDIGIT(*p))
6880 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006882 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006883 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006884 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006885
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 e = p + PyUnicode_GET_SIZE(self);
6887 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006888 if (!Py_UNICODE_ISDIGIT(*p))
6889 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006891 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892}
6893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006894PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006895 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006897Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006898False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899
6900static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006901unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902{
6903 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6904 register const Py_UNICODE *e;
6905
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 /* Shortcut for single character strings */
6907 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006908 Py_UNICODE_ISNUMERIC(*p))
6909 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006911 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006912 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006913 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006914
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915 e = p + PyUnicode_GET_SIZE(self);
6916 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006917 if (!Py_UNICODE_ISNUMERIC(*p))
6918 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006920 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921}
6922
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006923PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006924 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925\n\
6926Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006927iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928
6929static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006930unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006932 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006933}
6934
Martin v. Löwis18e16552006-02-15 17:27:45 +00006935static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936unicode_length(PyUnicodeObject *self)
6937{
6938 return self->length;
6939}
6940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006941PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006942 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006944Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006945done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
6947static PyObject *
6948unicode_ljust(PyUnicodeObject *self, PyObject *args)
6949{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006950 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006951 Py_UNICODE fillchar = ' ';
6952
Martin v. Löwis412fb672006-04-13 06:34:32 +00006953 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954 return NULL;
6955
Tim Peters7a29bd52001-09-12 03:03:31 +00006956 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957 Py_INCREF(self);
6958 return (PyObject*) self;
6959 }
6960
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006961 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962}
6963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006964PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006965 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006967Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968
6969static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006970unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972 return fixup(self, fixlower);
6973}
6974
/* Which end(s) of the string a strip operation works on.  The values
   double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip type i: the format string with its leading
   "|O:" (three characters) skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
6983
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006984/* externally visible for str.strip(unicode) */
6985PyObject *
6986_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6987{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006988 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6989 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6990 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6991 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6992 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006993
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006994 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006995
Benjamin Peterson857ce152009-01-31 16:29:18 +00006996 i = 0;
6997 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006998 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6999 i++;
7000 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007001 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007002
Benjamin Peterson857ce152009-01-31 16:29:18 +00007003 j = len;
7004 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007005 do {
7006 j--;
7007 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7008 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007009 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007010
Benjamin Peterson857ce152009-01-31 16:29:18 +00007011 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007012 Py_INCREF(self);
7013 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007014 }
7015 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007016 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007017}
7018
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019
7020static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007021do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007023 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7024 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007025
Benjamin Peterson857ce152009-01-31 16:29:18 +00007026 i = 0;
7027 if (striptype != RIGHTSTRIP) {
7028 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7029 i++;
7030 }
7031 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007032
Benjamin Peterson857ce152009-01-31 16:29:18 +00007033 j = len;
7034 if (striptype != LEFTSTRIP) {
7035 do {
7036 j--;
7037 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7038 j++;
7039 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007040
Benjamin Peterson857ce152009-01-31 16:29:18 +00007041 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7042 Py_INCREF(self);
7043 return (PyObject*)self;
7044 }
7045 else
7046 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007047}
7048
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007049
7050static PyObject *
7051do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7052{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007053 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007054
Benjamin Peterson857ce152009-01-31 16:29:18 +00007055 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7056 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007057
Benjamin Peterson857ce152009-01-31 16:29:18 +00007058 if (sep != NULL && sep != Py_None) {
7059 if (PyUnicode_Check(sep))
7060 return _PyUnicode_XStrip(self, striptype, sep);
7061 else if (PyString_Check(sep)) {
7062 PyObject *res;
7063 sep = PyUnicode_FromObject(sep);
7064 if (sep==NULL)
7065 return NULL;
7066 res = _PyUnicode_XStrip(self, striptype, sep);
7067 Py_DECREF(sep);
7068 return res;
7069 }
7070 else {
7071 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007072 "%s arg must be None, unicode or str",
7073 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007074 return NULL;
7075 }
7076 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007077
Benjamin Peterson857ce152009-01-31 16:29:18 +00007078 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007079}
7080
7081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007082PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007083 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007084\n\
7085Return a copy of the string S with leading and trailing\n\
7086whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007087If chars is given and not None, remove characters in chars instead.\n\
7088If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007089
7090static PyObject *
7091unicode_strip(PyUnicodeObject *self, PyObject *args)
7092{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007093 if (PyTuple_GET_SIZE(args) == 0)
7094 return do_strip(self, BOTHSTRIP); /* Common case */
7095 else
7096 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007097}
7098
7099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007100PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007101 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007102\n\
7103Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007104If chars is given and not None, remove characters in chars instead.\n\
7105If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007106
7107static PyObject *
7108unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7109{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007110 if (PyTuple_GET_SIZE(args) == 0)
7111 return do_strip(self, LEFTSTRIP); /* Common case */
7112 else
7113 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007114}
7115
7116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007117PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007118 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007119\n\
7120Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007121If chars is given and not None, remove characters in chars instead.\n\
7122If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007123
7124static PyObject *
7125unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7126{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007127 if (PyTuple_GET_SIZE(args) == 0)
7128 return do_strip(self, RIGHTSTRIP); /* Common case */
7129 else
7130 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007131}
7132
7133
Guido van Rossumd57fd912000-03-10 22:53:23 +00007134static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007135unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136{
7137 PyUnicodeObject *u;
7138 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007139 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007140 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141
7142 if (len < 0)
7143 len = 0;
7144
Tim Peters7a29bd52001-09-12 03:03:31 +00007145 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 /* no repeat, return original string */
7147 Py_INCREF(str);
7148 return (PyObject*) str;
7149 }
Tim Peters8f422462000-09-09 06:13:41 +00007150
7151 /* ensure # of chars needed doesn't overflow int and # of bytes
7152 * needed doesn't overflow size_t
7153 */
7154 nchars = len * str->length;
7155 if (len && nchars / len != str->length) {
7156 PyErr_SetString(PyExc_OverflowError,
7157 "repeated string is too long");
7158 return NULL;
7159 }
7160 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7161 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7162 PyErr_SetString(PyExc_OverflowError,
7163 "repeated string is too long");
7164 return NULL;
7165 }
7166 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167 if (!u)
7168 return NULL;
7169
7170 p = u->str;
7171
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007172 if (str->length == 1 && len > 0) {
7173 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007174 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007175 Py_ssize_t done = 0; /* number of characters copied this far */
7176 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007177 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007178 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007179 }
7180 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007181 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007182 Py_UNICODE_COPY(p+done, p, n);
7183 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007184 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186
7187 return (PyObject*) u;
7188}
7189
7190PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007191 PyObject *subobj,
7192 PyObject *replobj,
7193 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194{
7195 PyObject *self;
7196 PyObject *str1;
7197 PyObject *str2;
7198 PyObject *result;
7199
7200 self = PyUnicode_FromObject(obj);
7201 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203 str1 = PyUnicode_FromObject(subobj);
7204 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007205 Py_DECREF(self);
7206 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 }
7208 str2 = PyUnicode_FromObject(replobj);
7209 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007210 Py_DECREF(self);
7211 Py_DECREF(str1);
7212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 }
Tim Petersced69f82003-09-16 20:30:58 +00007214 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007215 (PyUnicodeObject *)str1,
7216 (PyUnicodeObject *)str2,
7217 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 Py_DECREF(self);
7219 Py_DECREF(str1);
7220 Py_DECREF(str2);
7221 return result;
7222}
7223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007224PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007225 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226\n\
7227Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007228old replaced by new. If the optional argument count is\n\
7229given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230
7231static PyObject*
7232unicode_replace(PyUnicodeObject *self, PyObject *args)
7233{
7234 PyUnicodeObject *str1;
7235 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007236 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 PyObject *result;
7238
Martin v. Löwis18e16552006-02-15 17:27:45 +00007239 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240 return NULL;
7241 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7242 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007245 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007246 Py_DECREF(str1);
7247 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007248 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249
7250 result = replace(self, str1, str2, maxcount);
7251
7252 Py_DECREF(str1);
7253 Py_DECREF(str2);
7254 return result;
7255}
7256
7257static
7258PyObject *unicode_repr(PyObject *unicode)
7259{
7260 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007261 PyUnicode_GET_SIZE(unicode),
7262 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263}
7264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007265PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007266 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267\n\
7268Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007269such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270arguments start and end are interpreted as in slice notation.\n\
7271\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007272Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273
7274static PyObject *
7275unicode_rfind(PyUnicodeObject *self, PyObject *args)
7276{
Jesus Cea44e81682011-04-20 16:39:15 +02007277 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007278 Py_ssize_t start;
7279 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007280 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281
Jesus Cea44e81682011-04-20 16:39:15 +02007282 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7283 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007285
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007286 result = stringlib_rfind_slice(
7287 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7288 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7289 start, end
7290 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007291
7292 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007293
7294 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295}
7296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007297PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007298 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007299\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007300Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301
7302static PyObject *
7303unicode_rindex(PyUnicodeObject *self, PyObject *args)
7304{
Jesus Cea44e81682011-04-20 16:39:15 +02007305 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007306 Py_ssize_t start;
7307 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007308 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309
Jesus Cea44e81682011-04-20 16:39:15 +02007310 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7311 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007314 result = stringlib_rfind_slice(
7315 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7316 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7317 start, end
7318 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319
7320 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007321
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322 if (result < 0) {
7323 PyErr_SetString(PyExc_ValueError, "substring not found");
7324 return NULL;
7325 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007326 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327}
7328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007329PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007330 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007331\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007332Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007333done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334
7335static PyObject *
7336unicode_rjust(PyUnicodeObject *self, PyObject *args)
7337{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007338 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007339 Py_UNICODE fillchar = ' ';
7340
Martin v. Löwis412fb672006-04-13 06:34:32 +00007341 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 return NULL;
7343
Tim Peters7a29bd52001-09-12 03:03:31 +00007344 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 Py_INCREF(self);
7346 return (PyObject*) self;
7347 }
7348
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007349 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350}
7351
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007353unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354{
7355 /* standard clamping */
7356 if (start < 0)
7357 start = 0;
7358 if (end < 0)
7359 end = 0;
7360 if (end > self->length)
7361 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007362 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363 /* full slice, return original string */
7364 Py_INCREF(self);
7365 return (PyObject*) self;
7366 }
7367 if (start > end)
7368 start = end;
7369 /* copy slice */
7370 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007371 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007372}
7373
7374PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007375 PyObject *sep,
7376 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377{
7378 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007379
Guido van Rossumd57fd912000-03-10 22:53:23 +00007380 s = PyUnicode_FromObject(s);
7381 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007382 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007383 if (sep != NULL) {
7384 sep = PyUnicode_FromObject(sep);
7385 if (sep == NULL) {
7386 Py_DECREF(s);
7387 return NULL;
7388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007389 }
7390
7391 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7392
7393 Py_DECREF(s);
7394 Py_XDECREF(sep);
7395 return result;
7396}
7397
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007398PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007399 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400\n\
7401Return a list of the words in S, using sep as the\n\
7402delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007403splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007404whitespace string is a separator and empty strings are\n\
7405removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406
7407static PyObject*
7408unicode_split(PyUnicodeObject *self, PyObject *args)
7409{
7410 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007411 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412
Martin v. Löwis18e16552006-02-15 17:27:45 +00007413 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007414 return NULL;
7415
7416 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007417 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007419 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007421 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007422}
7423
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007424PyObject *
7425PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7426{
7427 PyObject* str_obj;
7428 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007429 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007430
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007431 str_obj = PyUnicode_FromObject(str_in);
7432 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007433 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007434 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007435 if (!sep_obj) {
7436 Py_DECREF(str_obj);
7437 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007438 }
7439
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007440 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007441 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7442 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7443 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007444
Fredrik Lundhb9479482006-05-26 17:22:38 +00007445 Py_DECREF(sep_obj);
7446 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007447
7448 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007449}
7450
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007451
7452PyObject *
7453PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7454{
7455 PyObject* str_obj;
7456 PyObject* sep_obj;
7457 PyObject* out;
7458
7459 str_obj = PyUnicode_FromObject(str_in);
7460 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007461 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007462 sep_obj = PyUnicode_FromObject(sep_in);
7463 if (!sep_obj) {
7464 Py_DECREF(str_obj);
7465 return NULL;
7466 }
7467
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007468 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007469 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7470 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7471 );
7472
7473 Py_DECREF(sep_obj);
7474 Py_DECREF(str_obj);
7475
7476 return out;
7477}
7478
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007479PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007480 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007481\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007482Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007483the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007484found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007485
7486static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007487unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007488{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007489 return PyUnicode_Partition((PyObject *)self, separator);
7490}
7491
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007492PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007493 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007494\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007495Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007496the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007497separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007498
7499static PyObject*
7500unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7501{
7502 return PyUnicode_RPartition((PyObject *)self, separator);
7503}
7504
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007505PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007506 PyObject *sep,
7507 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007508{
7509 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007510
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007511 s = PyUnicode_FromObject(s);
7512 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007513 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007514 if (sep != NULL) {
7515 sep = PyUnicode_FromObject(sep);
7516 if (sep == NULL) {
7517 Py_DECREF(s);
7518 return NULL;
7519 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007520 }
7521
7522 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7523
7524 Py_DECREF(s);
7525 Py_XDECREF(sep);
7526 return result;
7527}
7528
7529PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007530 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007531\n\
7532Return a list of the words in S, using sep as the\n\
7533delimiter string, starting at the end of the string and\n\
7534working to the front. If maxsplit is given, at most maxsplit\n\
7535splits are done. If sep is not specified, any whitespace string\n\
7536is a separator.");
7537
7538static PyObject*
7539unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7540{
7541 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007542 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007543
Martin v. Löwis18e16552006-02-15 17:27:45 +00007544 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007545 return NULL;
7546
7547 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007548 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007549 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007550 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007551 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007552 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007553}
7554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007555PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007556 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557\n\
7558Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007559Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007560is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
7562static PyObject*
7563unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7564{
Guido van Rossum86662912000-04-11 15:38:46 +00007565 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566
Guido van Rossum86662912000-04-11 15:38:46 +00007567 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 return NULL;
7569
Guido van Rossum86662912000-04-11 15:38:46 +00007570 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571}
7572
7573static
7574PyObject *unicode_str(PyUnicodeObject *self)
7575{
Fred Drakee4315f52000-05-09 19:53:39 +00007576 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577}
7578
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007579PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007580 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581\n\
7582Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
7585static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007586unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588 return fixup(self, fixswapcase);
7589}
7590
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007591PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007592 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593\n\
7594Return a copy of the string S, where all characters have been mapped\n\
7595through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007596Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7597Unmapped characters are left untouched. Characters mapped to None\n\
7598are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599
7600static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007601unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602{
Tim Petersced69f82003-09-16 20:30:58 +00007603 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007604 self->length,
7605 table,
7606 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607}
7608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007609PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007610 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007612Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613
7614static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007615unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007617 return fixup(self, fixupper);
7618}
7619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007620PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007621 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622\n\
Georg Brandl98064072008-09-09 19:26:00 +00007623Pad a numeric string S with zeros on the left, to fill a field\n\
7624of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007625
7626static PyObject *
7627unicode_zfill(PyUnicodeObject *self, PyObject *args)
7628{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007629 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 PyUnicodeObject *u;
7631
Martin v. Löwis18e16552006-02-15 17:27:45 +00007632 Py_ssize_t width;
7633 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634 return NULL;
7635
7636 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007637 if (PyUnicode_CheckExact(self)) {
7638 Py_INCREF(self);
7639 return (PyObject*) self;
7640 }
7641 else
7642 return PyUnicode_FromUnicode(
7643 PyUnicode_AS_UNICODE(self),
7644 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007645 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 }
7647
7648 fill = width - self->length;
7649
7650 u = pad(self, fill, 0, '0');
7651
Walter Dörwald068325e2002-04-15 13:36:47 +00007652 if (u == NULL)
7653 return NULL;
7654
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 if (u->str[fill] == '+' || u->str[fill] == '-') {
7656 /* move sign to beginning of string */
7657 u->str[0] = u->str[fill];
7658 u->str[fill] = '0';
7659 }
7660
7661 return (PyObject*) u;
7662}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663
#if 0
/* Compiled out by the surrounding #if 0: returns the file-level `numfree`
 * value as a Python int.
 */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007672PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007673 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007675Return True if S starts with the specified prefix, False otherwise.\n\
7676With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007677With optional end, stop comparing S at that position.\n\
7678prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679
7680static PyObject *
7681unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007682 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683{
Georg Brandl24250812006-06-09 18:45:48 +00007684 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007686 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007687 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007688 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689
Jesus Cea44e81682011-04-20 16:39:15 +02007690 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007691 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007692 if (PyTuple_Check(subobj)) {
7693 Py_ssize_t i;
7694 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7695 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007696 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007697 if (substring == NULL)
7698 return NULL;
7699 result = tailmatch(self, substring, start, end, -1);
7700 Py_DECREF(substring);
7701 if (result) {
7702 Py_RETURN_TRUE;
7703 }
7704 }
7705 /* nothing matched */
7706 Py_RETURN_FALSE;
7707 }
7708 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007709 if (substring == NULL) {
7710 if (PyErr_ExceptionMatches(PyExc_TypeError))
7711 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7712 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007713 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007714 }
Georg Brandl24250812006-06-09 18:45:48 +00007715 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007717 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718}
7719
7720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007721PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007722 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007724Return True if S ends with the specified suffix, False otherwise.\n\
7725With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007726With optional end, stop comparing S at that position.\n\
7727suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728
7729static PyObject *
7730unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007731 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732{
Georg Brandl24250812006-06-09 18:45:48 +00007733 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007735 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007736 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007737 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738
Jesus Cea44e81682011-04-20 16:39:15 +02007739 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007740 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007741 if (PyTuple_Check(subobj)) {
7742 Py_ssize_t i;
7743 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7744 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007745 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007746 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007747 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007748 result = tailmatch(self, substring, start, end, +1);
7749 Py_DECREF(substring);
7750 if (result) {
7751 Py_RETURN_TRUE;
7752 }
7753 }
7754 Py_RETURN_FALSE;
7755 }
7756 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007757 if (substring == NULL) {
7758 if (PyErr_ExceptionMatches(PyExc_TypeError))
7759 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7760 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007761 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007762 }
Georg Brandl24250812006-06-09 18:45:48 +00007763 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007765 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766}
7767
7768
Eric Smitha9f7d622008-02-17 19:46:49 +00007769/* Implements do_string_format, which is unicode because of stringlib */
7770#include "stringlib/string_format.h"
7771
7772PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007773 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007774\n\
Eric Smith6c840852010-11-06 19:43:44 +00007775Return a formatted version of S, using substitutions from args and kwargs.\n\
7776The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007777
Eric Smithdc13b792008-05-30 18:10:04 +00007778static PyObject *
7779unicode__format__(PyObject *self, PyObject *args)
7780{
7781 PyObject *format_spec;
7782 PyObject *result = NULL;
7783 PyObject *tmp = NULL;
7784
7785 /* If 2.x, convert format_spec to the same type as value */
7786 /* This is to allow things like u''.format('') */
7787 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7788 goto done;
7789 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7790 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007791 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007792 goto done;
7793 }
7794 tmp = PyObject_Unicode(format_spec);
7795 if (tmp == NULL)
7796 goto done;
7797 format_spec = tmp;
7798
7799 result = _PyUnicode_FormatAdvanced(self,
7800 PyUnicode_AS_UNICODE(format_spec),
7801 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007802 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007803 Py_XDECREF(tmp);
7804 return result;
7805}
7806
Eric Smitha9f7d622008-02-17 19:46:49 +00007807PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007808 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007809\n\
Eric Smith6c840852010-11-06 19:43:44 +00007810Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007811
Robert Schuppenies901c9972008-06-10 10:10:31 +00007812static PyObject *
7813unicode__sizeof__(PyUnicodeObject *v)
7814{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007815 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7816 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007817}
7818
7819PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007820 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007821\n\
7822");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007823
7824static PyObject *
7825unicode_getnewargs(PyUnicodeObject *v)
7826{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007827 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007828}
7829
7830
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007832 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007833 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7834 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007835 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007836 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7837 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7838 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7839 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7840 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7841 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7842 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007843 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007844 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7845 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7846 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007847 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007848 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007849/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7850 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7851 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7852 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007853 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007854 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007855 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007856 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007857 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7858 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7859 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7860 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7861 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7862 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7863 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7864 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7865 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7866 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7867 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7868 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7869 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7870 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007871 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007872 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7873 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7874 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7875 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007876 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007877#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007878 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879#endif
7880
7881#if 0
7882 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007883 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884#endif
7885
Benjamin Peterson857ce152009-01-31 16:29:18 +00007886 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 {NULL, NULL}
7888};
7889
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007890static PyObject *
7891unicode_mod(PyObject *v, PyObject *w)
7892{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007893 if (!PyUnicode_Check(v)) {
7894 Py_INCREF(Py_NotImplemented);
7895 return Py_NotImplemented;
7896 }
7897 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007898}
7899
7900static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007901 0, /*nb_add*/
7902 0, /*nb_subtract*/
7903 0, /*nb_multiply*/
7904 0, /*nb_divide*/
7905 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007906};
7907
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007909 (lenfunc) unicode_length, /* sq_length */
7910 PyUnicode_Concat, /* sq_concat */
7911 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7912 (ssizeargfunc) unicode_getitem, /* sq_item */
7913 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7914 0, /* sq_ass_item */
7915 0, /* sq_ass_slice */
7916 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917};
7918
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007919static PyObject*
7920unicode_subscript(PyUnicodeObject* self, PyObject* item)
7921{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007922 if (PyIndex_Check(item)) {
7923 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007924 if (i == -1 && PyErr_Occurred())
7925 return NULL;
7926 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007927 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007928 return unicode_getitem(self, i);
7929 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007930 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007931 Py_UNICODE* source_buf;
7932 Py_UNICODE* result_buf;
7933 PyObject* result;
7934
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007935 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007936 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007937 return NULL;
7938 }
7939
7940 if (slicelength <= 0) {
7941 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007942 } else if (start == 0 && step == 1 && slicelength == self->length &&
7943 PyUnicode_CheckExact(self)) {
7944 Py_INCREF(self);
7945 return (PyObject *)self;
7946 } else if (step == 1) {
7947 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007948 } else {
7949 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007950 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7951 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007952
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007953 if (result_buf == NULL)
7954 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007955
7956 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7957 result_buf[i] = source_buf[cur];
7958 }
Tim Petersced69f82003-09-16 20:30:58 +00007959
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007960 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007961 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007962 return result;
7963 }
7964 } else {
7965 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7966 return NULL;
7967 }
7968}
7969
7970static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007971 (lenfunc)unicode_length, /* mp_length */
7972 (binaryfunc)unicode_subscript, /* mp_subscript */
7973 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007974};
7975
Martin v. Löwis18e16552006-02-15 17:27:45 +00007976static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007978 Py_ssize_t index,
7979 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980{
7981 if (index != 0) {
7982 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007983 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 return -1;
7985 }
7986 *ptr = (void *) self->str;
7987 return PyUnicode_GET_DATA_SIZE(self);
7988}
7989
Martin v. Löwis18e16552006-02-15 17:27:45 +00007990static Py_ssize_t
7991unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007992 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993{
7994 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007995 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 return -1;
7997}
7998
7999static int
8000unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008001 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002{
8003 if (lenp)
8004 *lenp = PyUnicode_GET_DATA_SIZE(self);
8005 return 1;
8006}
8007
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008008static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008010 Py_ssize_t index,
8011 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012{
8013 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008014
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 if (index != 0) {
8016 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008017 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 return -1;
8019 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008020 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008022 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008023 *ptr = (void *) PyString_AS_STRING(str);
8024 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025}
8026
8027/* Helpers for PyUnicode_Format() */
8028
8029static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008030getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008032 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008034 (*p_argidx)++;
8035 if (arglen < 0)
8036 return args;
8037 else
8038 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 }
8040 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008041 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 return NULL;
8043}
8044
/* Bit flags collected while parsing a %-format specifier. */
#define F_LJUST (1<<0)  /* '-' flag */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050
Martin v. Löwis18e16552006-02-15 17:27:45 +00008051static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008052strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008054 register Py_ssize_t i;
8055 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008057 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058
Guido van Rossumd57fd912000-03-10 22:53:23 +00008059 return len;
8060}
8061
Neal Norwitzfc76d632006-01-10 06:03:13 +00008062static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008063longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8064{
Tim Peters15231542006-02-16 01:08:01 +00008065 Py_ssize_t result;
8066
Neal Norwitzfc76d632006-01-10 06:03:13 +00008067 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008068 result = strtounicode(buffer, (char *)buffer);
8069 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008070}
8071
Guido van Rossum078151d2002-08-11 04:24:12 +00008072/* XXX To save some code duplication, formatfloat/long/int could have been
8073 shared with stringobject.c, converting from 8-bit to Unicode after the
8074 formatting is done. */
8075
Mark Dickinson18cfada2009-11-23 18:46:41 +00008076/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8077
8078static PyObject *
8079formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008081 char *p;
8082 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008084
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 x = PyFloat_AsDouble(v);
8086 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008087 return NULL;
8088
Guido van Rossumd57fd912000-03-10 22:53:23 +00008089 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008090 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008091
Mark Dickinson18cfada2009-11-23 18:46:41 +00008092 p = PyOS_double_to_string(x, type, prec,
8093 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8094 if (p == NULL)
8095 return NULL;
8096 result = PyUnicode_FromStringAndSize(p, strlen(p));
8097 PyMem_Free(p);
8098 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099}
8100
Tim Peters38fd5b62000-09-21 05:43:11 +00008101static PyObject*
8102formatlong(PyObject *val, int flags, int prec, int type)
8103{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008104 char *buf;
8105 int i, len;
8106 PyObject *str; /* temporary string object. */
8107 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008108
Benjamin Peterson857ce152009-01-31 16:29:18 +00008109 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8110 if (!str)
8111 return NULL;
8112 result = _PyUnicode_New(len);
8113 if (!result) {
8114 Py_DECREF(str);
8115 return NULL;
8116 }
8117 for (i = 0; i < len; i++)
8118 result->str[i] = buf[i];
8119 result->str[len] = 0;
8120 Py_DECREF(str);
8121 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008122}
8123
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124static int
8125formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008126 size_t buflen,
8127 int flags,
8128 int prec,
8129 int type,
8130 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008132 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008133 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8134 * + 1 + 1
8135 * = 24
8136 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008137 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008138 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139 long x;
8140
8141 x = PyInt_AsLong(v);
8142 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008143 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008144 if (x < 0 && type == 'u') {
8145 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008146 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008147 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8148 sign = "-";
8149 else
8150 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008152 prec = 1;
8153
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008154 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8155 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008156 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008157 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008158 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008159 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008160 return -1;
8161 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008162
8163 if ((flags & F_ALT) &&
8164 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008165 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008166 * of issues that cause pain:
8167 * - when 0 is being converted, the C standard leaves off
8168 * the '0x' or '0X', which is inconsistent with other
8169 * %#x/%#X conversions and inconsistent with Python's
8170 * hex() function
8171 * - there are platforms that violate the standard and
8172 * convert 0 with the '0x' or '0X'
8173 * (Metrowerks, Compaq Tru64)
8174 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008175 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008176 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008177 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008178 * We can achieve the desired consistency by inserting our
8179 * own '0x' or '0X' prefix, and substituting %x/%X in place
8180 * of %#x/%#X.
8181 *
8182 * Note that this is the same approach as used in
8183 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008184 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008185 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8186 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008187 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008188 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008189 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8190 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008191 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008192 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008193 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008194 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008195 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008196 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197}
8198
8199static int
8200formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008201 size_t buflen,
8202 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203{
Ezio Melotti32125152010-02-25 17:36:04 +00008204 PyObject *unistr;
8205 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008206 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008207 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008208 if (PyUnicode_GET_SIZE(v) != 1)
8209 goto onError;
8210 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008213 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008214 if (PyString_GET_SIZE(v) != 1)
8215 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008216 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8217 with a UnicodeDecodeError if 'char' is not decodable with the
8218 default encoding (usually ASCII, but it might be something else) */
8219 str = PyString_AS_STRING(v);
8220 if ((unsigned char)str[0] > 0x7F) {
8221 /* the char is not ASCII; try to decode the string using the
8222 default encoding and return -1 to let the UnicodeDecodeError
8223 be raised if the string can't be decoded */
8224 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8225 if (unistr == NULL)
8226 return -1;
8227 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8228 Py_DECREF(unistr);
8229 }
8230 else
8231 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233
8234 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008235 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008237 x = PyInt_AsLong(v);
8238 if (x == -1 && PyErr_Occurred())
8239 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008240#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008241 if (x < 0 || x > 0x10ffff) {
8242 PyErr_SetString(PyExc_OverflowError,
8243 "%c arg not in range(0x110000) "
8244 "(wide Python build)");
8245 return -1;
8246 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008247#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008248 if (x < 0 || x > 0xffff) {
8249 PyErr_SetString(PyExc_OverflowError,
8250 "%c arg not in range(0x10000) "
8251 "(narrow Python build)");
8252 return -1;
8253 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008254#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008255 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 }
8257 buf[1] = '\0';
8258 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008259
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008260 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008261 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008262 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008263 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264}
8265
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008266/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8267
Mark Dickinson18cfada2009-11-23 18:46:41 +00008268 FORMATBUFLEN is the length of the buffer in which the ints &
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008269 chars are formatted. XXX This is a magic number. Each formatting
8270 routine does bounds checking to ensure no overflow, but a better
8271 solution may be to malloc a buffer of appropriate size for each
8272 format. For now, the current solution is sufficient.
8273*/
8274#define FORMATBUFLEN (size_t)120
8275
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008277 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278{
8279 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008280 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 int args_owned = 0;
8282 PyUnicodeObject *result = NULL;
8283 PyObject *dict = NULL;
8284 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008285
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008287 PyErr_BadInternalCall();
8288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
8290 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008291 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 fmt = PyUnicode_AS_UNICODE(uformat);
8294 fmtcnt = PyUnicode_GET_SIZE(uformat);
8295
8296 reslen = rescnt = fmtcnt + 100;
8297 result = _PyUnicode_New(reslen);
8298 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008299 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 res = PyUnicode_AS_UNICODE(result);
8301
8302 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008303 arglen = PyTuple_Size(args);
8304 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 }
8306 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008307 arglen = -1;
8308 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 }
Benjamin Peterson23d49d32012-08-28 17:55:35 -04008310 if (PyMapping_Check(args) && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008311 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008312 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313
8314 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008315 if (*fmt != '%') {
8316 if (--rescnt < 0) {
8317 rescnt = fmtcnt + 100;
8318 reslen += rescnt;
8319 if (_PyUnicode_Resize(&result, reslen) < 0)
8320 goto onError;
8321 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8322 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008323 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008324 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008325 }
8326 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008327 /* Got a format specifier */
8328 int flags = 0;
8329 Py_ssize_t width = -1;
8330 int prec = -1;
8331 Py_UNICODE c = '\0';
8332 Py_UNICODE fill;
8333 int isnumok;
8334 PyObject *v = NULL;
8335 PyObject *temp = NULL;
8336 Py_UNICODE *pbuf;
8337 Py_UNICODE sign;
8338 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008339 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008340
8341 fmt++;
8342 if (*fmt == '(') {
8343 Py_UNICODE *keystart;
8344 Py_ssize_t keylen;
8345 PyObject *key;
8346 int pcount = 1;
8347
8348 if (dict == NULL) {
8349 PyErr_SetString(PyExc_TypeError,
8350 "format requires a mapping");
8351 goto onError;
8352 }
8353 ++fmt;
8354 --fmtcnt;
8355 keystart = fmt;
8356 /* Skip over balanced parentheses */
8357 while (pcount > 0 && --fmtcnt >= 0) {
8358 if (*fmt == ')')
8359 --pcount;
8360 else if (*fmt == '(')
8361 ++pcount;
8362 fmt++;
8363 }
8364 keylen = fmt - keystart - 1;
8365 if (fmtcnt < 0 || pcount > 0) {
8366 PyErr_SetString(PyExc_ValueError,
8367 "incomplete format key");
8368 goto onError;
8369 }
8370#if 0
8371 /* keys are converted to strings using UTF-8 and
8372 then looked up since Python uses strings to hold
8373 variables names etc. in its namespaces and we
8374 wouldn't want to break common idioms. */
8375 key = PyUnicode_EncodeUTF8(keystart,
8376 keylen,
8377 NULL);
8378#else
8379 key = PyUnicode_FromUnicode(keystart, keylen);
8380#endif
8381 if (key == NULL)
8382 goto onError;
8383 if (args_owned) {
8384 Py_DECREF(args);
8385 args_owned = 0;
8386 }
8387 args = PyObject_GetItem(dict, key);
8388 Py_DECREF(key);
8389 if (args == NULL) {
8390 goto onError;
8391 }
8392 args_owned = 1;
8393 arglen = -1;
8394 argidx = -2;
8395 }
8396 while (--fmtcnt >= 0) {
8397 switch (c = *fmt++) {
8398 case '-': flags |= F_LJUST; continue;
8399 case '+': flags |= F_SIGN; continue;
8400 case ' ': flags |= F_BLANK; continue;
8401 case '#': flags |= F_ALT; continue;
8402 case '0': flags |= F_ZERO; continue;
8403 }
8404 break;
8405 }
8406 if (c == '*') {
8407 v = getnextarg(args, arglen, &argidx);
8408 if (v == NULL)
8409 goto onError;
8410 if (!PyInt_Check(v)) {
8411 PyErr_SetString(PyExc_TypeError,
8412 "* wants int");
8413 goto onError;
8414 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008415 width = PyInt_AsSsize_t(v);
8416 if (width == -1 && PyErr_Occurred())
8417 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008418 if (width < 0) {
8419 flags |= F_LJUST;
8420 width = -width;
8421 }
8422 if (--fmtcnt >= 0)
8423 c = *fmt++;
8424 }
8425 else if (c >= '0' && c <= '9') {
8426 width = c - '0';
8427 while (--fmtcnt >= 0) {
8428 c = *fmt++;
8429 if (c < '0' || c > '9')
8430 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008431 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008432 PyErr_SetString(PyExc_ValueError,
8433 "width too big");
8434 goto onError;
8435 }
8436 width = width*10 + (c - '0');
8437 }
8438 }
8439 if (c == '.') {
8440 prec = 0;
8441 if (--fmtcnt >= 0)
8442 c = *fmt++;
8443 if (c == '*') {
8444 v = getnextarg(args, arglen, &argidx);
8445 if (v == NULL)
8446 goto onError;
8447 if (!PyInt_Check(v)) {
8448 PyErr_SetString(PyExc_TypeError,
8449 "* wants int");
8450 goto onError;
8451 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008452 prec = _PyInt_AsInt(v);
8453 if (prec == -1 && PyErr_Occurred())
8454 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008455 if (prec < 0)
8456 prec = 0;
8457 if (--fmtcnt >= 0)
8458 c = *fmt++;
8459 }
8460 else if (c >= '0' && c <= '9') {
8461 prec = c - '0';
8462 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008463 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008464 if (c < '0' || c > '9')
8465 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008466 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008467 PyErr_SetString(PyExc_ValueError,
8468 "prec too big");
8469 goto onError;
8470 }
8471 prec = prec*10 + (c - '0');
8472 }
8473 }
8474 } /* prec */
8475 if (fmtcnt >= 0) {
8476 if (c == 'h' || c == 'l' || c == 'L') {
8477 if (--fmtcnt >= 0)
8478 c = *fmt++;
8479 }
8480 }
8481 if (fmtcnt < 0) {
8482 PyErr_SetString(PyExc_ValueError,
8483 "incomplete format");
8484 goto onError;
8485 }
8486 if (c != '%') {
8487 v = getnextarg(args, arglen, &argidx);
8488 if (v == NULL)
8489 goto onError;
8490 }
8491 sign = 0;
8492 fill = ' ';
8493 switch (c) {
8494
8495 case '%':
8496 pbuf = formatbuf;
8497 /* presume that buffer length is at least 1 */
8498 pbuf[0] = '%';
8499 len = 1;
8500 break;
8501
8502 case 's':
8503 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008504 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008505 temp = v;
8506 Py_INCREF(temp);
8507 }
8508 else {
8509 PyObject *unicode;
8510 if (c == 's')
8511 temp = PyObject_Unicode(v);
8512 else
8513 temp = PyObject_Repr(v);
8514 if (temp == NULL)
8515 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008516 if (PyUnicode_Check(temp))
8517 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008518 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008519 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008520 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8521 PyString_GET_SIZE(temp),
8522 NULL,
8523 "strict");
8524 Py_DECREF(temp);
8525 temp = unicode;
8526 if (temp == NULL)
8527 goto onError;
8528 }
8529 else {
8530 Py_DECREF(temp);
8531 PyErr_SetString(PyExc_TypeError,
8532 "%s argument has non-string str()");
8533 goto onError;
8534 }
8535 }
8536 pbuf = PyUnicode_AS_UNICODE(temp);
8537 len = PyUnicode_GET_SIZE(temp);
8538 if (prec >= 0 && len > prec)
8539 len = prec;
8540 break;
8541
8542 case 'i':
8543 case 'd':
8544 case 'u':
8545 case 'o':
8546 case 'x':
8547 case 'X':
8548 if (c == 'i')
8549 c = 'd';
8550 isnumok = 0;
8551 if (PyNumber_Check(v)) {
8552 PyObject *iobj=NULL;
8553
8554 if (PyInt_Check(v) || (PyLong_Check(v))) {
8555 iobj = v;
8556 Py_INCREF(iobj);
8557 }
8558 else {
8559 iobj = PyNumber_Int(v);
8560 if (iobj==NULL) iobj = PyNumber_Long(v);
8561 }
8562 if (iobj!=NULL) {
8563 if (PyInt_Check(iobj)) {
8564 isnumok = 1;
8565 pbuf = formatbuf;
8566 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8567 flags, prec, c, iobj);
8568 Py_DECREF(iobj);
8569 if (len < 0)
8570 goto onError;
8571 sign = 1;
8572 }
8573 else if (PyLong_Check(iobj)) {
8574 isnumok = 1;
8575 temp = formatlong(iobj, flags, prec, c);
8576 Py_DECREF(iobj);
8577 if (!temp)
8578 goto onError;
8579 pbuf = PyUnicode_AS_UNICODE(temp);
8580 len = PyUnicode_GET_SIZE(temp);
8581 sign = 1;
8582 }
8583 else {
8584 Py_DECREF(iobj);
8585 }
8586 }
8587 }
8588 if (!isnumok) {
8589 PyErr_Format(PyExc_TypeError,
8590 "%%%c format: a number is required, "
8591 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8592 goto onError;
8593 }
8594 if (flags & F_ZERO)
8595 fill = '0';
8596 break;
8597
8598 case 'e':
8599 case 'E':
8600 case 'f':
8601 case 'F':
8602 case 'g':
8603 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008604 temp = formatfloat(v, flags, prec, c);
8605 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008606 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008607 pbuf = PyUnicode_AS_UNICODE(temp);
8608 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008609 sign = 1;
8610 if (flags & F_ZERO)
8611 fill = '0';
8612 break;
8613
8614 case 'c':
8615 pbuf = formatbuf;
8616 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8617 if (len < 0)
8618 goto onError;
8619 break;
8620
8621 default:
8622 PyErr_Format(PyExc_ValueError,
8623 "unsupported format character '%c' (0x%x) "
8624 "at index %zd",
8625 (31<=c && c<=126) ? (char)c : '?',
8626 (int)c,
8627 (Py_ssize_t)(fmt - 1 -
8628 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008629 goto onError;
8630 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008631 if (sign) {
8632 if (*pbuf == '-' || *pbuf == '+') {
8633 sign = *pbuf++;
8634 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008635 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008636 else if (flags & F_SIGN)
8637 sign = '+';
8638 else if (flags & F_BLANK)
8639 sign = ' ';
8640 else
8641 sign = 0;
8642 }
8643 if (width < len)
8644 width = len;
8645 if (rescnt - (sign != 0) < width) {
8646 reslen -= rescnt;
8647 rescnt = width + fmtcnt + 100;
8648 reslen += rescnt;
8649 if (reslen < 0) {
8650 Py_XDECREF(temp);
8651 PyErr_NoMemory();
8652 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008653 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008654 if (_PyUnicode_Resize(&result, reslen) < 0) {
8655 Py_XDECREF(temp);
8656 goto onError;
8657 }
8658 res = PyUnicode_AS_UNICODE(result)
8659 + reslen - rescnt;
8660 }
8661 if (sign) {
8662 if (fill != ' ')
8663 *res++ = sign;
8664 rescnt--;
8665 if (width > len)
8666 width--;
8667 }
8668 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8669 assert(pbuf[0] == '0');
8670 assert(pbuf[1] == c);
8671 if (fill != ' ') {
8672 *res++ = *pbuf++;
8673 *res++ = *pbuf++;
8674 }
8675 rescnt -= 2;
8676 width -= 2;
8677 if (width < 0)
8678 width = 0;
8679 len -= 2;
8680 }
8681 if (width > len && !(flags & F_LJUST)) {
8682 do {
8683 --rescnt;
8684 *res++ = fill;
8685 } while (--width > len);
8686 }
8687 if (fill == ' ') {
8688 if (sign)
8689 *res++ = sign;
8690 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8691 assert(pbuf[0] == '0');
8692 assert(pbuf[1] == c);
8693 *res++ = *pbuf++;
8694 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008695 }
8696 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008697 Py_UNICODE_COPY(res, pbuf, len);
8698 res += len;
8699 rescnt -= len;
8700 while (--width >= len) {
8701 --rescnt;
8702 *res++ = ' ';
8703 }
8704 if (dict && (argidx < arglen) && c != '%') {
8705 PyErr_SetString(PyExc_TypeError,
8706 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008707 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008708 goto onError;
8709 }
8710 Py_XDECREF(temp);
8711 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 } /* until end */
8713 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008714 PyErr_SetString(PyExc_TypeError,
8715 "not all arguments converted during string formatting");
8716 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 }
8718
Thomas Woutersa96affe2006-03-12 00:29:36 +00008719 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008720 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008722 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723 }
8724 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008725 return (PyObject *)result;
8726
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008727 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 Py_XDECREF(result);
8729 Py_DECREF(uformat);
8730 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008731 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732 }
8733 return NULL;
8734}
8735
8736static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008737 (readbufferproc) unicode_buffer_getreadbuf,
8738 (writebufferproc) unicode_buffer_getwritebuf,
8739 (segcountproc) unicode_buffer_getsegcount,
8740 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741};
8742
Jeremy Hylton938ace62002-07-17 16:30:39 +00008743static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008744unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8745
Tim Peters6d6c1a32001-08-02 04:15:00 +00008746static PyObject *
8747unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8748{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008749 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008750 static char *kwlist[] = {"string", "encoding", "errors", 0};
8751 char *encoding = NULL;
8752 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008753
Benjamin Peterson857ce152009-01-31 16:29:18 +00008754 if (type != &PyUnicode_Type)
8755 return unicode_subtype_new(type, args, kwds);
8756 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008757 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008758 return NULL;
8759 if (x == NULL)
8760 return (PyObject *)_PyUnicode_New(0);
8761 if (encoding == NULL && errors == NULL)
8762 return PyObject_Unicode(x);
8763 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008764 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008765}
8766
Guido van Rossume023fe02001-08-30 03:12:59 +00008767static PyObject *
8768unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8769{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008770 PyUnicodeObject *tmp, *pnew;
8771 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008772
Benjamin Peterson857ce152009-01-31 16:29:18 +00008773 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8774 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8775 if (tmp == NULL)
8776 return NULL;
8777 assert(PyUnicode_Check(tmp));
8778 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8779 if (pnew == NULL) {
8780 Py_DECREF(tmp);
8781 return NULL;
8782 }
8783 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8784 if (pnew->str == NULL) {
8785 _Py_ForgetReference((PyObject *)pnew);
8786 PyObject_Del(pnew);
8787 Py_DECREF(tmp);
8788 return PyErr_NoMemory();
8789 }
8790 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8791 pnew->length = n;
8792 pnew->hash = tmp->hash;
8793 Py_DECREF(tmp);
8794 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008795}
8796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008797PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008798 "unicode(object='') -> unicode object\n\
8799unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008800\n\
8801Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008802encoding defaults to the current default string encoding.\n\
8803errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008804
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008806 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008807 "unicode", /* tp_name */
8808 sizeof(PyUnicodeObject), /* tp_size */
8809 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008811 (destructor)unicode_dealloc, /* tp_dealloc */
8812 0, /* tp_print */
8813 0, /* tp_getattr */
8814 0, /* tp_setattr */
8815 0, /* tp_compare */
8816 unicode_repr, /* tp_repr */
8817 &unicode_as_number, /* tp_as_number */
8818 &unicode_as_sequence, /* tp_as_sequence */
8819 &unicode_as_mapping, /* tp_as_mapping */
8820 (hashfunc) unicode_hash, /* tp_hash*/
8821 0, /* tp_call*/
8822 (reprfunc) unicode_str, /* tp_str */
8823 PyObject_GenericGetAttr, /* tp_getattro */
8824 0, /* tp_setattro */
8825 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008826 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008827 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008828 unicode_doc, /* tp_doc */
8829 0, /* tp_traverse */
8830 0, /* tp_clear */
8831 PyUnicode_RichCompare, /* tp_richcompare */
8832 0, /* tp_weaklistoffset */
8833 0, /* tp_iter */
8834 0, /* tp_iternext */
8835 unicode_methods, /* tp_methods */
8836 0, /* tp_members */
8837 0, /* tp_getset */
8838 &PyBaseString_Type, /* tp_base */
8839 0, /* tp_dict */
8840 0, /* tp_descr_get */
8841 0, /* tp_descr_set */
8842 0, /* tp_dictoffset */
8843 0, /* tp_init */
8844 0, /* tp_alloc */
8845 unicode_new, /* tp_new */
8846 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847};
8848
8849/* Initialize the Unicode implementation */
8850
Thomas Wouters78890102000-07-22 19:25:51 +00008851void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008853 int i;
8854
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008855 /* XXX - move this array to unicodectype.c ? */
8856 Py_UNICODE linebreak[] = {
8857 0x000A, /* LINE FEED */
8858 0x000D, /* CARRIAGE RETURN */
8859 0x001C, /* FILE SEPARATOR */
8860 0x001D, /* GROUP SEPARATOR */
8861 0x001E, /* RECORD SEPARATOR */
8862 0x0085, /* NEXT LINE */
8863 0x2028, /* LINE SEPARATOR */
8864 0x2029, /* PARAGRAPH SEPARATOR */
8865 };
8866
Fred Drakee4315f52000-05-09 19:53:39 +00008867 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008868 free_list = NULL;
8869 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008871 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008872 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008873
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008874 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008875 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008876 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008877 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008878 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008879
8880 /* initialize the linebreak bloom filter */
8881 bloom_linebreak = make_bloom_mask(
8882 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8883 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008884
8885 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008886
8887 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8888 Py_FatalError("Can't initialize field name iterator type");
8889
8890 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8891 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892}
8893
8894/* Finalize the Unicode implementation */
8895
Christian Heimes3b718a72008-02-14 12:47:33 +00008896int
8897PyUnicode_ClearFreeList(void)
8898{
8899 int freelist_size = numfree;
8900 PyUnicodeObject *u;
8901
8902 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008903 PyUnicodeObject *v = u;
8904 u = *(PyUnicodeObject **)u;
8905 if (v->str)
8906 PyObject_DEL(v->str);
8907 Py_XDECREF(v->defenc);
8908 PyObject_Del(v);
8909 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008910 }
8911 free_list = NULL;
8912 assert(numfree == 0);
8913 return freelist_size;
8914}
8915
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916void
Thomas Wouters78890102000-07-22 19:25:51 +00008917_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008919 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008921 Py_XDECREF(unicode_empty);
8922 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008923
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008924 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008925 if (unicode_latin1[i]) {
8926 Py_DECREF(unicode_latin1[i]);
8927 unicode_latin1[i] = NULL;
8928 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008929 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008930 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008931}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008932
Anthony Baxterac6bd462006-04-13 02:06:09 +00008933#ifdef __cplusplus
8934}
8935#endif