blob: 46bfe2b54c39403ab9c4692ca9fe52e2cf2ccf63 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Martin v. Löwis5cb69362006-04-14 09:08:42 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
44
Guido van Rossumd57fd912000-03-10 22:53:23 +000045#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000046#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
/* Limit for the Unicode object free list: unicode_dealloc() only adds
   an object to the list while the list holds fewer entries than this. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Anthony Baxterac6bd462006-04-13 02:06:09 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Guido van Rossumd57fd912000-03-10 22:53:23 +000095/* Free list for Unicode objects */
Christian Heimes5b970ad2008-02-06 13:33:44 +000096static PyUnicodeObject *free_list;
97static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000099/* The empty Unicode object is shared to improve performance. */
100static PyUnicodeObject *unicode_empty;
101
102/* Single character Unicode strings in the Latin-1 range are being
103 shared as well. */
104static PyUnicodeObject *unicode_latin1[256];
105
Fred Drakee4315f52000-05-09 19:53:39 +0000106/* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
108
109 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000110 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000111
112*/
Fred Drakee4315f52000-05-09 19:53:39 +0000113static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.

   Lookup table for code points 0..127: an entry is 1 for the whitespace
   characters named in the comments below and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Same for linebreaks.

   Lookup table for code points 0..127: an entry is 1 for the line break
   characters named in the comments below and 0 for everything else.
   The table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000175Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000176PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000177{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000178#ifdef Py_UNICODE_WIDE
Benjamin Peterson857ce152009-01-31 16:29:18 +0000179 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000180#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
183 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000184#endif
185}
186
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000187/* --- Bloom Filters ----------------------------------------------------- */
188
189/* stuff to implement simple "bloom filters" for Unicode characters.
190 to keep things simple, we use a single bitmask, using the least 5
191 bits from each unicode characters as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
Antoine Pitrou10042922010-01-13 14:01:26 +0000195#if LONG_BIT >= 128
196#define BLOOM_WIDTH 128
197#elif LONG_BIT >= 64
198#define BLOOM_WIDTH 64
199#elif LONG_BIT >= 32
200#define BLOOM_WIDTH 32
201#else
202#error "LONG_BIT is smaller than 32"
203#endif
204
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000205#define BLOOM_MASK unsigned long
206
207static BLOOM_MASK bloom_linebreak;
208
Antoine Pitrou10042922010-01-13 14:01:26 +0000209#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000211
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000212#define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000215
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000217{
218 /* calculate simple bloom-style bitmask for a given unicode string */
219
Antoine Pitrou10042922010-01-13 14:01:26 +0000220 BLOOM_MASK mask;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000221 Py_ssize_t i;
222
223 mask = 0;
224 for (i = 0; i < len; i++)
Antoine Pitrou64672132010-01-13 07:55:48 +0000225 BLOOM_ADD(mask, ptr[i]);
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000226
227 return mask;
228}
229
Fredrik Lundhc2d29c52006-05-27 14:58:20 +0000230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000231{
232 Py_ssize_t i;
233
234 for (i = 0; i < setlen; i++)
235 if (set[i] == chr)
236 return 1;
237
Fredrik Lundh77633512006-05-23 19:47:35 +0000238 return 0;
Fredrik Lundhb63588c2006-05-23 18:44:25 +0000239}
240
/* Membership test with a bloom pre-filter: unicode_member() is only
   called when the mask says chr may be present.  The expansion is fully
   parenthesized so it can be negated or combined with other operators
   safely.  Note that chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244/* --- Unicode Object ----------------------------------------------------- */
245
246static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000247int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000248 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249{
250 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000251
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000252 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 if (unicode->length == length)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000254 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
258 instead ! */
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000259
Benjamin Peterson857ce152009-01-31 16:29:18 +0000260 if (unicode == unicode_empty ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000261 (unicode->length == 1 &&
262 unicode->str[0] < 256U &&
263 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000266 return -1;
267 }
268
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000269 /* We allocate one more byte to make sure the string is Ux0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
Andrew M. Kuchling07bbfc62006-05-26 19:51:10 +0000271 safe to look at str[length] (without making any assumptions about what
Fredrik Lundh06a69dd2006-05-26 08:54:28 +0000272 it contains). */
273
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 oldstr = unicode->str;
Neal Norwitz419fd492008-03-17 20:22:43 +0000275 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000276 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000278 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279 PyErr_NoMemory();
280 return -1;
281 }
282 unicode->str[length] = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +0000283 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000285 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000286 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000287 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000288 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 }
290 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292 return 0;
293}
294
295/* We allocate one more byte to make sure the string is
Georg Brandle27d0442010-08-01 20:54:30 +0000296 Ux0000 terminated; some code relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000297
298 XXX This allocator could further be enhanced by assuring that the
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000299 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300
301*/
302
303static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305{
306 register PyUnicodeObject *unicode;
307
Andrew Dalkee0df7622006-05-27 11:04:36 +0000308 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309 if (length == 0 && unicode_empty != NULL) {
310 Py_INCREF(unicode_empty);
311 return unicode_empty;
312 }
313
Neal Norwitze7d8be82008-07-31 17:17:14 +0000314 /* Ensure we won't overflow the size. */
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316 return (PyUnicodeObject *)PyErr_NoMemory();
317 }
318
Guido van Rossumd57fd912000-03-10 22:53:23 +0000319 /* Unicode freelist & memory allocation */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000320 if (free_list) {
321 unicode = free_list;
322 free_list = *(PyUnicodeObject **)unicode;
323 numfree--;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000324 if (unicode->str) {
325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
327 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000328 unicode_resize(unicode, length) < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000329 PyObject_DEL(unicode->str);
330 unicode->str = NULL;
331 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000332 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000333 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000336 }
337 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 }
339 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000340 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000342 if (unicode == NULL)
343 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000346 }
347
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000348 if (!unicode->str) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000349 PyErr_NoMemory();
350 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000351 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000352 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
357 * that case.
358 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000359 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000360 unicode->str[length] = 0;
Martin v. Löwisf15da692006-04-13 07:24:50 +0000361 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000362 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000363 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000365
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000366 onError:
Amaury Forgeot d'Arc06847b12008-07-31 23:39:05 +0000367 /* XXX UNREF/NEWREF interface should be more symmetrical */
368 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000369 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000370 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372}
373
374static
Guido van Rossum9475a232001-10-05 20:51:39 +0000375void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000377 if (PyUnicode_CheckExact(unicode) &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000378 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000379 /* Keep-Alive optimization */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000380 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
381 PyObject_DEL(unicode->str);
382 unicode->str = NULL;
383 unicode->length = 0;
384 }
385 if (unicode->defenc) {
Georg Brandld070cc52010-08-01 21:06:46 +0000386 Py_CLEAR(unicode->defenc);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000387 }
388 /* Add to free list */
Christian Heimes5b970ad2008-02-06 13:33:44 +0000389 *(PyUnicodeObject **)unicode = free_list;
390 free_list = unicode;
391 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392 }
393 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000394 PyObject_DEL(unicode->str);
395 Py_XDECREF(unicode->defenc);
396 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000397 }
398}
399
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000400static
401int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000402{
403 register PyUnicodeObject *v;
404
405 /* Argument checks */
406 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000407 PyErr_BadInternalCall();
408 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000409 }
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000410 v = *unicode;
Christian Heimese93237d2007-12-19 02:37:44 +0000411 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000412 PyErr_BadInternalCall();
413 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000414 }
415
416 /* Resizing unicode_empty and single character objects is not
417 possible since these are being shared. We simply return a fresh
418 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000419 if (v->length != length &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000420 (v == unicode_empty || v->length == 1)) {
421 PyUnicodeObject *w = _PyUnicode_New(length);
422 if (w == NULL)
423 return -1;
424 Py_UNICODE_COPY(w->str, v->str,
425 length < v->length ? length : v->length);
426 Py_DECREF(*unicode);
427 *unicode = w;
428 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000429 }
430
431 /* Note that we don't have to modify *unicode for unshared Unicode
432 objects, since we can modify them in-place. */
433 return unicode_resize(v, length);
434}
435
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +0000436int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
437{
438 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
439}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440
Guido van Rossumd57fd912000-03-10 22:53:23 +0000441PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000442 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443{
444 PyUnicodeObject *unicode;
445
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000446 /* If the Unicode data is known at construction time, we can apply
447 some optimizations which share commonly used objects. */
448 if (u != NULL) {
449
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000450 /* Optimization for empty strings */
451 if (size == 0 && unicode_empty != NULL) {
452 Py_INCREF(unicode_empty);
453 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000454 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000455
456 /* Single character Unicode objects in the Latin-1 range are
457 shared when using this constructor */
458 if (size == 1 && *u < 256) {
459 unicode = unicode_latin1[*u];
460 if (!unicode) {
461 unicode = _PyUnicode_New(1);
462 if (!unicode)
463 return NULL;
464 unicode->str[0] = *u;
465 unicode_latin1[*u] = unicode;
466 }
467 Py_INCREF(unicode);
468 return (PyObject *)unicode;
469 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000470 }
Tim Petersced69f82003-09-16 20:30:58 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 unicode = _PyUnicode_New(size);
473 if (!unicode)
474 return NULL;
475
476 /* Copy the Unicode data into the new object */
477 if (u != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000478 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479
480 return (PyObject *)unicode;
481}
482
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000483PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
484{
485 PyUnicodeObject *unicode;
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000486
Benjamin Peterson857ce152009-01-31 16:29:18 +0000487 if (size < 0) {
488 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000489 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson857ce152009-01-31 16:29:18 +0000490 return NULL;
491 }
Gregory P. Smithc00eb732008-04-09 23:16:37 +0000492
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000493 /* If the Unicode data is known at construction time, we can apply
494 some optimizations which share commonly used objects.
495 Also, this means the input must be UTF-8, so fall back to the
496 UTF-8 decoder at the end. */
497 if (u != NULL) {
498
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000499 /* Optimization for empty strings */
500 if (size == 0 && unicode_empty != NULL) {
501 Py_INCREF(unicode_empty);
502 return (PyObject *)unicode_empty;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000503 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000504
505 /* Single characters are shared when using this constructor.
506 Restrict to ASCII, since the input must be UTF-8. */
507 if (size == 1 && Py_CHARMASK(*u) < 128) {
508 unicode = unicode_latin1[Py_CHARMASK(*u)];
509 if (!unicode) {
510 unicode = _PyUnicode_New(1);
511 if (!unicode)
512 return NULL;
513 unicode->str[0] = Py_CHARMASK(*u);
514 unicode_latin1[Py_CHARMASK(*u)] = unicode;
515 }
516 Py_INCREF(unicode);
517 return (PyObject *)unicode;
518 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000519
520 return PyUnicode_DecodeUTF8(u, size, NULL);
521 }
522
523 unicode = _PyUnicode_New(size);
524 if (!unicode)
525 return NULL;
526
527 return (PyObject *)unicode;
528}
529
530PyObject *PyUnicode_FromString(const char *u)
531{
532 size_t size = strlen(u);
533 if (size > PY_SSIZE_T_MAX) {
534 PyErr_SetString(PyExc_OverflowError, "input too long");
535 return NULL;
536 }
537
538 return PyUnicode_FromStringAndSize(u, size);
539}
540
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541#ifdef HAVE_WCHAR_H
542
Mark Dickinson6b265f12009-03-18 16:07:26 +0000543#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544# define CONVERT_WCHAR_TO_SURROGATES
545#endif
546
547#ifdef CONVERT_WCHAR_TO_SURROGATES
548
549/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550 to convert from UTF32 to UTF16. */
551
552PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
553 Py_ssize_t size)
554{
555 PyUnicodeObject *unicode;
556 register Py_ssize_t i;
557 Py_ssize_t alloc;
558 const wchar_t *orig_w;
559
560 if (w == NULL) {
561 PyErr_BadInternalCall();
562 return NULL;
563 }
564
565 alloc = size;
566 orig_w = w;
567 for (i = size; i > 0; i--) {
568 if (*w > 0xFFFF)
569 alloc++;
570 w++;
571 }
572 w = orig_w;
573 unicode = _PyUnicode_New(alloc);
574 if (!unicode)
575 return NULL;
576
577 /* Copy the wchar_t data into the new object */
578 {
579 register Py_UNICODE *u;
580 u = PyUnicode_AS_UNICODE(unicode);
581 for (i = size; i > 0; i--) {
582 if (*w > 0xFFFF) {
583 wchar_t ordinal = *w++;
584 ordinal -= 0x10000;
585 *u++ = 0xD800 | (ordinal >> 10);
586 *u++ = 0xDC00 | (ordinal & 0x3FF);
587 }
588 else
589 *u++ = *w++;
590 }
591 }
592 return (PyObject *)unicode;
593}
594
595#else
596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000598 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599{
600 PyUnicodeObject *unicode;
601
602 if (w == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000603 PyErr_BadInternalCall();
604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 }
606
607 unicode = _PyUnicode_New(size);
608 if (!unicode)
609 return NULL;
610
611 /* Copy the wchar_t data into the new object */
612#ifdef HAVE_USABLE_WCHAR_T
613 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000614#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000615 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000616 register Py_UNICODE *u;
617 register Py_ssize_t i;
618 u = PyUnicode_AS_UNICODE(unicode);
619 for (i = size; i > 0; i--)
620 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000621 }
622#endif
623
624 return (PyObject *)unicode;
625}
626
Mark Dickinson6b265f12009-03-18 16:07:26 +0000627#endif /* CONVERT_WCHAR_TO_SURROGATES */
628
629#undef CONVERT_WCHAR_TO_SURROGATES
630
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000631static void
632makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
633{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000634 *fmt++ = '%';
635 if (width) {
636 if (zeropad)
637 *fmt++ = '0';
638 fmt += sprintf(fmt, "%d", width);
639 }
640 if (precision)
641 fmt += sprintf(fmt, ".%d", precision);
642 if (longflag)
643 *fmt++ = 'l';
644 else if (size_tflag) {
645 char *f = PY_FORMAT_SIZE_T;
646 while (*f)
647 *fmt++ = *f++;
648 }
649 *fmt++ = c;
650 *fmt = '\0';
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000651}
652
/* Append the NUL-terminated string to the output cursor.  Relies on two
   variables in the expanding scope: `copy` (a scratch const char *) and
   `s` (the output cursor, advanced past the copied characters).  No
   terminator is written.  The argument is parenthesized so that any
   expression can be passed safely. */
#define appendstring(string) {for (copy = (string); *copy;) *s++ = *copy++;}
654
655PyObject *
656PyUnicode_FromFormatV(const char *format, va_list vargs)
657{
Benjamin Peterson857ce152009-01-31 16:29:18 +0000658 va_list count;
659 Py_ssize_t callcount = 0;
660 PyObject **callresults = NULL;
661 PyObject **callresult = NULL;
662 Py_ssize_t n = 0;
663 int width = 0;
664 int precision = 0;
665 int zeropad;
666 const char* f;
667 Py_UNICODE *s;
668 PyObject *string;
669 /* used by sprintf */
670 char buffer[21];
671 /* use abuffer instead of buffer, if we need more space
672 * (which can happen if there's a format specifier with width). */
673 char *abuffer = NULL;
674 char *realbuffer;
675 Py_ssize_t abuffersize = 0;
676 char fmt[60]; /* should be enough for %0width.precisionld */
677 const char *copy;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000678
679#ifdef VA_LIST_IS_ARRAY
Benjamin Peterson857ce152009-01-31 16:29:18 +0000680 Py_MEMCPY(count, vargs, sizeof(va_list));
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000681#else
682#ifdef __va_copy
Benjamin Peterson857ce152009-01-31 16:29:18 +0000683 __va_copy(count, vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000684#else
Benjamin Peterson857ce152009-01-31 16:29:18 +0000685 count = vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000686#endif
687#endif
Walter Dörwalded960ac2009-05-03 22:36:33 +0000688 /* step 1: count the number of %S/%R/%s format specifications
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690 * objects once during step 3 and put the result in an array) */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000691 for (f = format; *f; f++) {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000692 if (*f == '%') {
693 if (*(f+1)=='%')
694 continue;
Walter Dörwald342c8db2009-05-03 22:46:07 +0000695 if (*(f+1)=='S' || *(f+1)=='R')
Walter Dörwalded960ac2009-05-03 22:36:33 +0000696 ++callcount;
697 while (isdigit((unsigned)*f))
698 width = (width*10) + *f++ - '0';
699 while (*++f && *f != '%' && !isalpha((unsigned)*f))
700 ;
701 if (*f == 's')
702 ++callcount;
703 }
Benjamin Peterson857ce152009-01-31 16:29:18 +0000704 }
705 /* step 2: allocate memory for the results of
Walter Dörwalded960ac2009-05-03 22:36:33 +0000706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson857ce152009-01-31 16:29:18 +0000707 if (callcount) {
708 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
709 if (!callresults) {
710 PyErr_NoMemory();
711 return NULL;
712 }
713 callresult = callresults;
714 }
715 /* step 3: figure out how large a buffer we need */
716 for (f = format; *f; f++) {
717 if (*f == '%') {
718 const char* p = f;
719 width = 0;
720 while (isdigit((unsigned)*f))
721 width = (width*10) + *f++ - '0';
722 while (*++f && *f != '%' && !isalpha((unsigned)*f))
723 ;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000724
Benjamin Peterson857ce152009-01-31 16:29:18 +0000725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726 * they don't affect the amount of space we reserve.
727 */
728 if ((*f == 'l' || *f == 'z') &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000729 (f[1] == 'd' || f[1] == 'u'))
730 ++f;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000731
Benjamin Peterson857ce152009-01-31 16:29:18 +0000732 switch (*f) {
733 case 'c':
734 (void)va_arg(count, int);
735 /* fall through... */
736 case '%':
737 n++;
738 break;
739 case 'd': case 'u': case 'i': case 'x':
740 (void) va_arg(count, int);
741 /* 20 bytes is enough to hold a 64-bit
742 integer. Decimal takes the most space.
743 This isn't enough for octal.
744 If a width is specified we need more
745 (which we allocate later). */
746 if (width < 20)
747 width = 20;
748 n += width;
749 if (abuffersize < width)
750 abuffersize = width;
751 break;
752 case 's':
753 {
754 /* UTF-8 */
Georg Brandlba68a992009-05-05 09:19:43 +0000755 const char *s = va_arg(count, const char*);
Walter Dörwalded960ac2009-05-03 22:36:33 +0000756 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
757 if (!str)
758 goto fail;
759 n += PyUnicode_GET_SIZE(str);
760 /* Remember the str and switch to the next slot */
761 *callresult++ = str;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000762 break;
763 }
764 case 'U':
765 {
766 PyObject *obj = va_arg(count, PyObject *);
767 assert(obj && PyUnicode_Check(obj));
768 n += PyUnicode_GET_SIZE(obj);
769 break;
770 }
771 case 'V':
772 {
773 PyObject *obj = va_arg(count, PyObject *);
774 const char *str = va_arg(count, const char *);
775 assert(obj || str);
776 assert(!obj || PyUnicode_Check(obj));
777 if (obj)
778 n += PyUnicode_GET_SIZE(obj);
779 else
780 n += strlen(str);
781 break;
782 }
783 case 'S':
784 {
785 PyObject *obj = va_arg(count, PyObject *);
786 PyObject *str;
787 assert(obj);
788 str = PyObject_Str(obj);
789 if (!str)
790 goto fail;
791 n += PyUnicode_GET_SIZE(str);
792 /* Remember the str and switch to the next slot */
793 *callresult++ = str;
794 break;
795 }
796 case 'R':
797 {
798 PyObject *obj = va_arg(count, PyObject *);
799 PyObject *repr;
800 assert(obj);
801 repr = PyObject_Repr(obj);
802 if (!repr)
803 goto fail;
804 n += PyUnicode_GET_SIZE(repr);
805 /* Remember the repr and switch to the next slot */
806 *callresult++ = repr;
807 break;
808 }
809 case 'p':
810 (void) va_arg(count, int);
811 /* maximum 64-bit pointer representation:
812 * 0xffffffffffffffff
813 * so 19 characters is enough.
814 * XXX I count 18 -- what's the extra for?
815 */
816 n += 19;
817 break;
818 default:
819 /* if we stumble upon an unknown
820 formatting code, copy the rest of
821 the format string to the output
822 string. (we cannot just skip the
823 code, since there's no way to know
824 what's in the argument list) */
825 n += strlen(p);
826 goto expand;
827 }
828 } else
829 n++;
830 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000831 expand:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000832 if (abuffersize > 20) {
833 abuffer = PyObject_Malloc(abuffersize);
834 if (!abuffer) {
835 PyErr_NoMemory();
836 goto fail;
837 }
838 realbuffer = abuffer;
839 }
840 else
841 realbuffer = buffer;
842 /* step 4: fill the buffer */
843 /* Since we've analyzed how much space we need for the worst case,
844 we don't have to resize the string.
845 There can be no errors beyond this point. */
846 string = PyUnicode_FromUnicode(NULL, n);
847 if (!string)
848 goto fail;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000849
Benjamin Peterson857ce152009-01-31 16:29:18 +0000850 s = PyUnicode_AS_UNICODE(string);
851 callresult = callresults;
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000852
Benjamin Peterson857ce152009-01-31 16:29:18 +0000853 for (f = format; *f; f++) {
854 if (*f == '%') {
855 const char* p = f++;
856 int longflag = 0;
857 int size_tflag = 0;
858 zeropad = (*f == '0');
859 /* parse the width.precision part */
860 width = 0;
861 while (isdigit((unsigned)*f))
862 width = (width*10) + *f++ - '0';
863 precision = 0;
864 if (*f == '.') {
865 f++;
866 while (isdigit((unsigned)*f))
867 precision = (precision*10) + *f++ - '0';
868 }
869 /* handle the long flag, but only for %ld and %lu.
870 others can be added when necessary. */
871 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
872 longflag = 1;
873 ++f;
874 }
875 /* handle the size_t flag. */
876 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
877 size_tflag = 1;
878 ++f;
879 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000880
Benjamin Peterson857ce152009-01-31 16:29:18 +0000881 switch (*f) {
882 case 'c':
883 *s++ = va_arg(vargs, int);
884 break;
885 case 'd':
886 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
887 if (longflag)
888 sprintf(realbuffer, fmt, va_arg(vargs, long));
889 else if (size_tflag)
890 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
891 else
892 sprintf(realbuffer, fmt, va_arg(vargs, int));
893 appendstring(realbuffer);
894 break;
895 case 'u':
896 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
897 if (longflag)
898 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
899 else if (size_tflag)
900 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
901 else
902 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
903 appendstring(realbuffer);
904 break;
905 case 'i':
906 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
907 sprintf(realbuffer, fmt, va_arg(vargs, int));
908 appendstring(realbuffer);
909 break;
910 case 'x':
911 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
912 sprintf(realbuffer, fmt, va_arg(vargs, int));
913 appendstring(realbuffer);
914 break;
915 case 's':
916 {
Walter Dörwalded960ac2009-05-03 22:36:33 +0000917 /* unused, since we already have the result */
918 (void) va_arg(vargs, char *);
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
920 PyUnicode_GET_SIZE(*callresult));
921 s += PyUnicode_GET_SIZE(*callresult);
922 /* We're done with the unicode()/repr() => forget it */
923 Py_DECREF(*callresult);
924 /* switch to next unicode()/repr() result */
925 ++callresult;
Benjamin Peterson857ce152009-01-31 16:29:18 +0000926 break;
927 }
928 case 'U':
929 {
930 PyObject *obj = va_arg(vargs, PyObject *);
931 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933 s += size;
934 break;
935 }
936 case 'V':
937 {
938 PyObject *obj = va_arg(vargs, PyObject *);
939 const char *str = va_arg(vargs, const char *);
940 if (obj) {
941 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
942 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
943 s += size;
944 } else {
945 appendstring(str);
946 }
947 break;
948 }
949 case 'S':
950 case 'R':
951 {
952 Py_UNICODE *ucopy;
953 Py_ssize_t usize;
954 Py_ssize_t upos;
955 /* unused, since we already have the result */
956 (void) va_arg(vargs, PyObject *);
957 ucopy = PyUnicode_AS_UNICODE(*callresult);
958 usize = PyUnicode_GET_SIZE(*callresult);
959 for (upos = 0; upos<usize;)
960 *s++ = ucopy[upos++];
961 /* We're done with the unicode()/repr() => forget it */
962 Py_DECREF(*callresult);
963 /* switch to next unicode()/repr() result */
964 ++callresult;
965 break;
966 }
967 case 'p':
968 sprintf(buffer, "%p", va_arg(vargs, void*));
969 /* %p is ill-defined: ensure leading 0x. */
970 if (buffer[1] == 'X')
971 buffer[1] = 'x';
972 else if (buffer[1] != 'x') {
973 memmove(buffer+2, buffer, strlen(buffer)+1);
974 buffer[0] = '0';
975 buffer[1] = 'x';
976 }
977 appendstring(buffer);
978 break;
979 case '%':
980 *s++ = '%';
981 break;
982 default:
983 appendstring(p);
984 goto end;
985 }
986 } else
987 *s++ = *f;
988 }
Christian Heimes7f39c9f2008-01-25 12:18:43 +0000989
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000990 end:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000991 if (callresults)
992 PyObject_Free(callresults);
993 if (abuffer)
994 PyObject_Free(abuffer);
995 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
996 return string;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +0000997 fail:
Benjamin Peterson857ce152009-01-31 16:29:18 +0000998 if (callresults) {
999 PyObject **callresult2 = callresults;
1000 while (callresult2 < callresult) {
1001 Py_DECREF(*callresult2);
1002 ++callresult2;
1003 }
1004 PyObject_Free(callresults);
1005 }
1006 if (abuffer)
1007 PyObject_Free(abuffer);
1008 return NULL;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001009}
1010
1011#undef appendstring
1012
1013PyObject *
1014PyUnicode_FromFormat(const char *format, ...)
1015{
Benjamin Peterson857ce152009-01-31 16:29:18 +00001016 PyObject* ret;
1017 va_list vargs;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001018
1019#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson857ce152009-01-31 16:29:18 +00001020 va_start(vargs, format);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001021#else
Benjamin Peterson857ce152009-01-31 16:29:18 +00001022 va_start(vargs);
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001023#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00001024 ret = PyUnicode_FromFormatV(format, vargs);
1025 va_end(vargs);
1026 return ret;
Christian Heimes7f39c9f2008-01-25 12:18:43 +00001027}
1028
Martin v. Löwis18e16552006-02-15 17:27:45 +00001029Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001030 wchar_t *w,
1031 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001032{
1033 if (unicode == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001034 PyErr_BadInternalCall();
1035 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001037
1038 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039 if (size > PyUnicode_GET_SIZE(unicode))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001040 size = PyUnicode_GET_SIZE(unicode) + 1;
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001041
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042#ifdef HAVE_USABLE_WCHAR_T
1043 memcpy(w, unicode->str, size * sizeof(wchar_t));
1044#else
1045 {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001046 register Py_UNICODE *u;
1047 register Py_ssize_t i;
1048 u = PyUnicode_AS_UNICODE(unicode);
1049 for (i = size; i > 0; i--)
1050 *w++ = *u++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051 }
1052#endif
1053
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +00001054 if (size > PyUnicode_GET_SIZE(unicode))
1055 return PyUnicode_GET_SIZE(unicode);
1056 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001057 return size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058}
1059
1060#endif
1061
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001062PyObject *PyUnicode_FromOrdinal(int ordinal)
1063{
Hye-Shik Chang40574832004-04-06 07:24:51 +00001064 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001065
1066#ifdef Py_UNICODE_WIDE
1067 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001068 PyErr_SetString(PyExc_ValueError,
1069 "unichr() arg not in range(0x110000) "
1070 "(wide Python build)");
1071 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001072 }
1073#else
1074 if (ordinal < 0 || ordinal > 0xffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001075 PyErr_SetString(PyExc_ValueError,
1076 "unichr() arg not in range(0x10000) "
1077 "(narrow Python build)");
1078 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001079 }
1080#endif
1081
Hye-Shik Chang40574832004-04-06 07:24:51 +00001082 s[0] = (Py_UNICODE)ordinal;
1083 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001084}
1085
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086PyObject *PyUnicode_FromObject(register PyObject *obj)
1087{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001088 /* XXX Perhaps we should make this API an alias of
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001089 PyObject_Unicode() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001090 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001091 Py_INCREF(obj);
1092 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001093 }
1094 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001095 /* For a Unicode subtype that's not a Unicode object,
1096 return a true Unicode object with the same data. */
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001099 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001100 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101}
1102
1103PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001104 const char *encoding,
1105 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001106{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001107 const char *s = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001108 Py_ssize_t len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001109 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001110
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 if (obj == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001112 PyErr_BadInternalCall();
1113 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001115
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001116#if 0
1117 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +00001118 that no encodings is given and then redirect to
1119 PyObject_Unicode() which then applies the additional logic for
1120 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001121
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001122 NOTE: This API should really only be used for object which
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001123 represent *encoded* Unicode !
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001124
1125 */
Benjamin Peterson857ce152009-01-31 16:29:18 +00001126 if (PyUnicode_Check(obj)) {
1127 if (encoding) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001128 PyErr_SetString(PyExc_TypeError,
1129 "decoding Unicode is not supported");
1130 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001131 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001132 return PyObject_Unicode(obj);
1133 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001134#else
1135 if (PyUnicode_Check(obj)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001136 PyErr_SetString(PyExc_TypeError,
1137 "decoding Unicode is not supported");
1138 return NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00001139 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001140#endif
1141
1142 /* Coerce object */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001143 if (PyString_Check(obj)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001144 s = PyString_AS_STRING(obj);
1145 len = PyString_GET_SIZE(obj);
Christian Heimes1a6387e2008-03-26 12:49:49 +00001146 }
Christian Heimes3497f942008-05-26 12:29:14 +00001147 else if (PyByteArray_Check(obj)) {
Christian Heimes1a6387e2008-03-26 12:49:49 +00001148 /* Python 2.x specific */
1149 PyErr_Format(PyExc_TypeError,
1150 "decoding bytearray is not supported");
1151 return NULL;
1152 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001153 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001154 /* Overwrite the error message with something more useful in
1155 case of a TypeError. */
1156 if (PyErr_ExceptionMatches(PyExc_TypeError))
1157 PyErr_Format(PyExc_TypeError,
1158 "coercing to Unicode: need string or buffer, "
1159 "%.80s found",
1160 Py_TYPE(obj)->tp_name);
1161 goto onError;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001162 }
Tim Petersced69f82003-09-16 20:30:58 +00001163
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001164 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 if (len == 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001166 Py_INCREF(unicode_empty);
1167 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 }
Tim Petersced69f82003-09-16 20:30:58 +00001169 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001170 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001171
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001172 return v;
1173
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001174 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176}
1177
1178PyObject *PyUnicode_Decode(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001179 Py_ssize_t size,
1180 const char *encoding,
1181 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182{
1183 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001184
1185 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001186 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001187
1188 /* Shortcuts for common default encodings */
1189 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001191 else if (strcmp(encoding, "latin-1") == 0)
1192 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194 else if (strcmp(encoding, "mbcs") == 0)
1195 return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
Fred Drakee4315f52000-05-09 19:53:39 +00001197 else if (strcmp(encoding, "ascii") == 0)
1198 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199
1200 /* Decode via the codec registry */
1201 buffer = PyBuffer_FromMemory((void *)s, size);
1202 if (buffer == NULL)
1203 goto onError;
1204 unicode = PyCodec_Decode(buffer, encoding, errors);
1205 if (unicode == NULL)
1206 goto onError;
1207 if (!PyUnicode_Check(unicode)) {
1208 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001209 "decoder did not return an unicode object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001210 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 Py_DECREF(unicode);
1212 goto onError;
1213 }
1214 Py_DECREF(buffer);
1215 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001216
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001217 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218 Py_XDECREF(buffer);
1219 return NULL;
1220}
1221
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001222PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223 const char *encoding,
1224 const char *errors)
1225{
1226 PyObject *v;
1227
1228 if (!PyUnicode_Check(unicode)) {
1229 PyErr_BadArgument();
1230 goto onError;
1231 }
1232
1233 if (encoding == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001234 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001235
1236 /* Decode via the codec registry */
1237 v = PyCodec_Decode(unicode, encoding, errors);
1238 if (v == NULL)
1239 goto onError;
1240 return v;
1241
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001242 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001243 return NULL;
1244}
1245
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001247 Py_ssize_t size,
1248 const char *encoding,
1249 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250{
1251 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001252
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 unicode = PyUnicode_FromUnicode(s, size);
1254 if (unicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257 Py_DECREF(unicode);
1258 return v;
1259}
1260
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001261PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262 const char *encoding,
1263 const char *errors)
1264{
1265 PyObject *v;
1266
1267 if (!PyUnicode_Check(unicode)) {
1268 PyErr_BadArgument();
1269 goto onError;
1270 }
1271
1272 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001273 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001274
1275 /* Encode via the codec registry */
1276 v = PyCodec_Encode(unicode, encoding, errors);
1277 if (v == NULL)
1278 goto onError;
1279 return v;
1280
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001281 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001282 return NULL;
1283}
1284
Guido van Rossumd57fd912000-03-10 22:53:23 +00001285PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286 const char *encoding,
1287 const char *errors)
1288{
1289 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001290
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 if (!PyUnicode_Check(unicode)) {
1292 PyErr_BadArgument();
1293 goto onError;
1294 }
Fred Drakee4315f52000-05-09 19:53:39 +00001295
Tim Petersced69f82003-09-16 20:30:58 +00001296 if (encoding == NULL)
Benjamin Petersond17fec72009-01-31 21:47:42 +00001297 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001298
1299 /* Shortcuts for common default encodings */
1300 if (errors == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001301 if (strcmp(encoding, "utf-8") == 0)
1302 return PyUnicode_AsUTF8String(unicode);
1303 else if (strcmp(encoding, "latin-1") == 0)
1304 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001305#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001306 else if (strcmp(encoding, "mbcs") == 0)
1307 return PyUnicode_AsMBCSString(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001308#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001309 else if (strcmp(encoding, "ascii") == 0)
1310 return PyUnicode_AsASCIIString(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +00001311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312
1313 /* Encode via the codec registry */
1314 v = PyCodec_Encode(unicode, encoding, errors);
1315 if (v == NULL)
1316 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001317 if (!PyString_Check(v)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001319 "encoder did not return a string object (type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00001320 Py_TYPE(v)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 Py_DECREF(v);
1322 goto onError;
1323 }
1324 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001325
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001326 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 return NULL;
1328}
1329
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001330PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001331 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001332{
1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334
1335 if (v)
1336 return v;
1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338 if (v && errors == NULL)
1339 ((PyUnicodeObject *)unicode)->defenc = v;
1340 return v;
1341}
1342
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344{
1345 if (!PyUnicode_Check(unicode)) {
1346 PyErr_BadArgument();
1347 goto onError;
1348 }
1349 return PyUnicode_AS_UNICODE(unicode);
1350
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001351 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 return NULL;
1353}
1354
Martin v. Löwis18e16552006-02-15 17:27:45 +00001355Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001356{
1357 if (!PyUnicode_Check(unicode)) {
1358 PyErr_BadArgument();
1359 goto onError;
1360 }
1361 return PyUnicode_GET_SIZE(unicode);
1362
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001363 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364 return -1;
1365}
1366
Thomas Wouters78890102000-07-22 19:25:51 +00001367const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +00001368{
1369 return unicode_default_encoding;
1370}
1371
1372int PyUnicode_SetDefaultEncoding(const char *encoding)
1373{
1374 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001375
Fred Drakee4315f52000-05-09 19:53:39 +00001376 /* Make sure the encoding is valid. As side effect, this also
1377 loads the encoding into the codec registry cache. */
1378 v = _PyCodec_Lookup(encoding);
1379 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001380 goto onError;
Fred Drakee4315f52000-05-09 19:53:39 +00001381 Py_DECREF(v);
1382 strncpy(unicode_default_encoding,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001383 encoding,
1384 sizeof(unicode_default_encoding));
Fred Drakee4315f52000-05-09 19:53:39 +00001385 return 0;
1386
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001387 onError:
Fred Drakee4315f52000-05-09 19:53:39 +00001388 return -1;
1389}
1390
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001391/* error handling callback helper:
1392 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001393 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001394 and adjust various state variables.
1395 return 0 on success, -1 on error
1396*/
1397
1398static
1399int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001400 const char *encoding, const char *reason,
1401 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001404{
Martin v. Löwis18e16552006-02-15 17:27:45 +00001405 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001406
1407 PyObject *restuple = NULL;
1408 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410 Py_ssize_t requiredsize;
1411 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412 Py_UNICODE *repptr;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001413 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 int res = -1;
1415
1416 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001417 *errorHandler = PyCodec_LookupError(errors);
1418 if (*errorHandler == NULL)
1419 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 }
1421
1422 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00001423 *exceptionObject = PyUnicodeDecodeError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001424 encoding, input, insize, *startinpos, *endinpos, reason);
1425 if (*exceptionObject == NULL)
1426 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001427 }
1428 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430 goto onError;
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432 goto onError;
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001435 }
1436
1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001439 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001440 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00001441 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001442 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001443 }
1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001445 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001446 if (newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001447 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001448 if (newpos<0 || newpos>insize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00001451 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001452
1453 /* need more space? (at least enough for what we
1454 have+the replacement+the rest of the string (starting
1455 at the new input position), so we won't have to check space
1456 when there are no errors in the rest of the string) */
1457 repptr = PyUnicode_AS_UNICODE(repunicode);
1458 repsize = PyUnicode_GET_SIZE(repunicode);
1459 requiredsize = *outpos + repsize + insize-newpos;
1460 if (requiredsize > outsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001461 if (requiredsize<2*outsize)
1462 requiredsize = 2*outsize;
1463 if (_PyUnicode_Resize(output, requiredsize) < 0)
1464 goto onError;
1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001466 }
1467 *endinpos = newpos;
1468 *inptr = input + newpos;
1469 Py_UNICODE_COPY(*outptr, repptr, repsize);
1470 *outptr += repsize;
1471 *outpos += repsize;
1472 /* we made it! */
1473 res = 0;
1474
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001475 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001476 Py_XDECREF(restuple);
1477 return res;
1478}
1479
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001480/* --- UTF-7 Codec -------------------------------------------------------- */
1481
Antoine Pitrou653dece2009-05-04 18:32:32 +00001482/* See RFC2152 for details. We encode conservatively and decode liberally. */
1483
/* Three simple macros defining base-64.
 *
 * All of the macros below may evaluate their argument more than once, so
 * pass only expressions without side effects. */

/* Is c a base-64 character?
 *
 * Uses explicit ASCII range tests instead of isalnum(): isalnum() is
 * locale dependent (it may accept non-ASCII bytes under some locales)
 * and is undefined for negative char values. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) <= 127 && (c) != '+')
1511
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1545
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - the character code to classify (evaluated several times)
 *   directO  - non-zero if "Set O" characters may be written directly
 *   directWS - non-zero if whitespace may be written directly
 *
 * Every argument is parenthesized in the expansion so that compound
 * expressions such as `a || b` keep their meaning next to the `&&`. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001557
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001558PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001559 Py_ssize_t size,
1560 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001561{
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001562 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563}
1564
Antoine Pitrou653dece2009-05-04 18:32:32 +00001565/* The decoder. The only state we preserve is our read position,
1566 * i.e. how many characters we have consumed. So if we end in the
1567 * middle of a shift sequence we have to back off the read position
1568 * and the output to the beginning of the sequence, otherwise we lose
1569 * all the shift state (seen bits, number of bits seen, high
1570 * surrogate). */
1571
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001572PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001573 Py_ssize_t size,
1574 const char *errors,
1575 Py_ssize_t *consumed)
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001576{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001577 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001578 Py_ssize_t startinpos;
1579 Py_ssize_t endinpos;
1580 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001581 const char *e;
1582 PyUnicodeObject *unicode;
1583 Py_UNICODE *p;
1584 const char *errmsg = "";
1585 int inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001586 Py_UNICODE *shiftOutStart;
1587 unsigned int base64bits = 0;
1588 unsigned long base64buffer = 0;
1589 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001590 PyObject *errorHandler = NULL;
1591 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001592
1593 unicode = _PyUnicode_New(size);
1594 if (!unicode)
1595 return NULL;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001596 if (size == 0) {
1597 if (consumed)
1598 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001599 return (PyObject *)unicode;
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001600 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001601
1602 p = unicode->str;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001603 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001604 e = s + size;
1605
1606 while (s < e) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001607 Py_UNICODE ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001608
Antoine Pitrou653dece2009-05-04 18:32:32 +00001609 if (inShift) { /* in a base-64 section */
1610 if (IS_BASE64(ch)) { /* consume a base-64 character */
1611 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612 base64bits += 6;
1613 s++;
1614 if (base64bits >= 16) {
1615 /* we have enough bits for a UTF-16 value */
1616 Py_UNICODE outCh = (Py_UNICODE)
1617 (base64buffer >> (base64bits-16));
1618 base64bits -= 16;
1619 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620 if (surrogate) {
1621 /* expecting a second surrogate */
1622 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623#ifdef Py_UNICODE_WIDE
1624 *p++ = (((surrogate & 0x3FF)<<10)
1625 | (outCh & 0x3FF)) + 0x10000;
1626#else
1627 *p++ = surrogate;
1628 *p++ = outCh;
1629#endif
1630 surrogate = 0;
Antoine Pitrou30402542011-11-15 01:49:40 +01001631 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001632 }
1633 else {
Antoine Pitrou30402542011-11-15 01:49:40 +01001634 *p++ = surrogate;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001635 surrogate = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001636 }
1637 }
Antoine Pitrou30402542011-11-15 01:49:40 +01001638 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001639 /* first surrogate */
1640 surrogate = outCh;
1641 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001642 else {
1643 *p++ = outCh;
1644 }
1645 }
1646 }
1647 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001648 inShift = 0;
1649 s++;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001650 if (surrogate) {
Antoine Pitrou30402542011-11-15 01:49:40 +01001651 *p++ = surrogate;
1652 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001653 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001654 if (base64bits > 0) { /* left-over bits */
1655 if (base64bits >= 6) {
1656 /* We've seen at least one base-64 character */
1657 errmsg = "partial character in shift sequence";
1658 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001659 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001660 else {
1661 /* Some bits remain; they should be zero */
1662 if (base64buffer != 0) {
1663 errmsg = "non-zero padding bits in shift sequence";
1664 goto utf7Error;
1665 }
1666 }
1667 }
1668 if (ch != '-') {
1669 /* '-' is absorbed; other terminating
1670 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001671 *p++ = ch;
1672 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001673 }
1674 }
1675 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001676 startinpos = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001677 s++; /* consume '+' */
1678 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001679 s++;
1680 *p++ = '+';
Antoine Pitrou653dece2009-05-04 18:32:32 +00001681 }
1682 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001683 inShift = 1;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001684 shiftOutStart = p;
1685 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001686 }
1687 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001688 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001689 *p++ = ch;
1690 s++;
1691 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001692 else {
1693 startinpos = s-starts;
1694 s++;
1695 errmsg = "unexpected special character";
1696 goto utf7Error;
1697 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001698 continue;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001699utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 outpos = p-PyUnicode_AS_UNICODE(unicode);
1701 endinpos = s-starts;
1702 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001703 errors, &errorHandler,
1704 "utf7", errmsg,
1705 starts, size, &startinpos, &endinpos, &exc, &s,
1706 &unicode, &outpos, &p))
1707 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001708 }
1709
Antoine Pitrou653dece2009-05-04 18:32:32 +00001710 /* end of string */
1711
1712 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1713 /* if we're in an inconsistent state, that's an error */
1714 if (surrogate ||
1715 (base64bits >= 6) ||
1716 (base64bits > 0 && base64buffer != 0)) {
1717 outpos = p-PyUnicode_AS_UNICODE(unicode);
1718 endinpos = size;
1719 if (unicode_decode_call_errorhandler(
1720 errors, &errorHandler,
1721 "utf7", "unterminated shift sequence",
1722 starts, size, &startinpos, &endinpos, &exc, &s,
1723 &unicode, &outpos, &p))
1724 goto onError;
1725 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001726 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001727
1728 /* return state */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001729 if (consumed) {
Antoine Pitrou653dece2009-05-04 18:32:32 +00001730 if (inShift) {
1731 p = shiftOutStart; /* back off output */
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001732 *consumed = startinpos;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001733 }
1734 else {
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001735 *consumed = s-starts;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001736 }
Amaury Forgeot d'Arc50879802007-11-20 23:31:27 +00001737 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001738
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001739 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001740 goto onError;
1741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 Py_XDECREF(errorHandler);
1743 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001744 return (PyObject *)unicode;
1745
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001746 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001747 Py_XDECREF(errorHandler);
1748 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001749 Py_DECREF(unicode);
1750 return NULL;
1751}
1752
1753
1754PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001755 Py_ssize_t size,
Antoine Pitrou653dece2009-05-04 18:32:32 +00001756 int base64SetO,
1757 int base64WhiteSpace,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001758 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001759{
1760 PyObject *v;
1761 /* It might be possible to tighten this worst case */
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001762 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001763 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001764 Py_ssize_t i = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001765 unsigned int base64bits = 0;
1766 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001767 char * out;
1768 char * start;
1769
Alexandre Vassalottifd009162009-07-07 02:17:30 +00001770 if (allocated / 8 != size)
Neal Norwitze7d8be82008-07-31 17:17:14 +00001771 return PyErr_NoMemory();
1772
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001773 if (size == 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00001774 return PyString_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001775
Antoine Pitrou653dece2009-05-04 18:32:32 +00001776 v = PyString_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001777 if (v == NULL)
1778 return NULL;
1779
Gregory P. Smithdd96db62008-06-09 04:58:54 +00001780 start = out = PyString_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001781 for (;i < size; ++i) {
1782 Py_UNICODE ch = s[i];
1783
Antoine Pitrou653dece2009-05-04 18:32:32 +00001784 if (inShift) {
1785 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1786 /* shifting out */
1787 if (base64bits) { /* output remaining bits */
1788 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1789 base64buffer = 0;
1790 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001791 }
1792 inShift = 0;
Antoine Pitrou653dece2009-05-04 18:32:32 +00001793 /* Characters not in the BASE64 set implicitly unshift the sequence
1794 so no '-' is required, except if the character is itself a '-' */
1795 if (IS_BASE64(ch) || ch == '-') {
1796 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001797 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001798 *out++ = (char) ch;
1799 }
1800 else {
1801 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00001802 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001803 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001804 else { /* not in a shift sequence */
1805 if (ch == '+') {
1806 *out++ = '+';
1807 *out++ = '-';
1808 }
1809 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1810 *out++ = (char) ch;
1811 }
1812 else {
1813 *out++ = '+';
1814 inShift = 1;
1815 goto encode_char;
1816 }
1817 }
1818 continue;
1819encode_char:
1820#ifdef Py_UNICODE_WIDE
1821 if (ch >= 0x10000) {
1822 /* code first surrogate */
1823 base64bits += 16;
1824 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1825 while (base64bits >= 6) {
1826 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1827 base64bits -= 6;
1828 }
1829 /* prepare second surrogate */
1830 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1831 }
1832#endif
1833 base64bits += 16;
1834 base64buffer = (base64buffer << 16) | ch;
1835 while (base64bits >= 6) {
1836 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1837 base64bits -= 6;
1838 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001839 }
Antoine Pitrou653dece2009-05-04 18:32:32 +00001840 if (base64bits)
1841 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1842 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001843 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001844
Benjamin Petersonbea424a2010-04-03 00:57:33 +00001845 if (_PyString_Resize(&v, out - start))
1846 return NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001847 return v;
1848}
1849
/* The UTF-7 helper macros are not needed past this point; drop them
   in the reverse order of their definition. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001855
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856/* --- UTF-8 Codec -------------------------------------------------------- */
1857
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.  The table is only
       ever read, hence const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1879
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001881 Py_ssize_t size,
1882 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883{
Walter Dörwald69652032004-09-07 20:24:22 +00001884 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1885}
1886
1887PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001888 Py_ssize_t size,
1889 const char *errors,
1890 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00001891{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001892 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001893 int n;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001894 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00001895 Py_ssize_t startinpos;
1896 Py_ssize_t endinpos;
1897 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 const char *e;
1899 PyUnicodeObject *unicode;
1900 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001901 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001902 PyObject *errorHandler = NULL;
1903 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904
1905 /* Note: size will always be longer than the resulting Unicode
1906 character count */
1907 unicode = _PyUnicode_New(size);
1908 if (!unicode)
1909 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001910 if (size == 0) {
1911 if (consumed)
1912 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001913 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001915
1916 /* Unpack UTF-8 encoded data */
1917 p = unicode->str;
1918 e = s + size;
1919
1920 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001921 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922
1923 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001924 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 s++;
1926 continue;
1927 }
1928
1929 n = utf8_code_length[ch];
1930
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001931 if (s + n > e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001932 if (consumed)
1933 break;
1934 else {
1935 errmsg = "unexpected end of data";
1936 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001937 endinpos = startinpos+1;
1938 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1939 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001940 goto utf8Error;
1941 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00001942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943
1944 switch (n) {
1945
1946 case 0:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001947 errmsg = "invalid start byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001948 startinpos = s-starts;
1949 endinpos = startinpos+1;
1950 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951
1952 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001953 errmsg = "internal error";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001954 startinpos = s-starts;
1955 endinpos = startinpos+1;
1956 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001959 if ((s[1] & 0xc0) != 0x80) {
Ezio Melottie57e50c2010-06-05 17:51:07 +00001960 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001961 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001962 endinpos = startinpos + 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001963 goto utf8Error;
1964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001966 assert ((ch > 0x007F) && (ch <= 0x07FF));
1967 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968 break;
1969
1970 case 3:
Ezio Melottie57e50c2010-06-05 17:51:07 +00001971 /* XXX: surrogates shouldn't be valid UTF-8!
1972 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1973 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1974 Uncomment the 2 lines below to make them invalid,
1975 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
Tim Petersced69f82003-09-16 20:30:58 +00001976 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00001977 (s[2] & 0xc0) != 0x80 ||
1978 ((unsigned char)s[0] == 0xE0 &&
1979 (unsigned char)s[1] < 0xA0)/* ||
1980 ((unsigned char)s[0] == 0xED &&
1981 (unsigned char)s[1] > 0x9F)*/) {
1982 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001983 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00001984 endinpos = startinpos + 1;
1985
1986 /* if s[1] first two bits are 1 and 0, then the invalid
1987 continuation byte is s[2], so increment endinpos by 1,
1988 if not, s[1] is invalid and endinpos doesn't need to
1989 be incremented. */
1990 if ((s[1] & 0xC0) == 0x80)
1991 endinpos++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00001992 goto utf8Error;
1993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melottie57e50c2010-06-05 17:51:07 +00001995 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
1996 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001997 break;
1998
1999 case 4:
2000 if ((s[1] & 0xc0) != 0x80 ||
2001 (s[2] & 0xc0) != 0x80 ||
Ezio Melottie57e50c2010-06-05 17:51:07 +00002002 (s[3] & 0xc0) != 0x80 ||
2003 ((unsigned char)s[0] == 0xF0 &&
2004 (unsigned char)s[1] < 0x90) ||
2005 ((unsigned char)s[0] == 0xF4 &&
2006 (unsigned char)s[1] > 0x8F)) {
2007 errmsg = "invalid continuation byte";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002008 startinpos = s-starts;
Ezio Melottie57e50c2010-06-05 17:51:07 +00002009 endinpos = startinpos + 1;
2010 if ((s[1] & 0xC0) == 0x80) {
2011 endinpos++;
2012 if ((s[2] & 0xC0) == 0x80)
2013 endinpos++;
2014 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002015 goto utf8Error;
2016 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002017 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melottie57e50c2010-06-05 17:51:07 +00002018 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2019 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2020
Fredrik Lundh8f455852001-06-27 18:59:43 +00002021#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002022 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002023#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002024 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002025
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002026 /* translate from 10000..10FFFF to 0..FFFF */
2027 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002028
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002029 /* high surrogate = top 10 bits added to D800 */
2030 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002031
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002032 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002033 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002034#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 }
2037 s += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002038 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002040 utf8Error:
2041 outpos = p-PyUnicode_AS_UNICODE(unicode);
2042 if (unicode_decode_call_errorhandler(
2043 errors, &errorHandler,
2044 "utf8", errmsg,
2045 starts, size, &startinpos, &endinpos, &exc, &s,
2046 &unicode, &outpos, &p))
2047 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 }
Walter Dörwald69652032004-09-07 20:24:22 +00002049 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002050 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051
2052 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002053 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 goto onError;
2055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056 Py_XDECREF(errorHandler);
2057 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 return (PyObject *)unicode;
2059
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002060 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 Py_XDECREF(errorHandler);
2062 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 Py_DECREF(unicode);
2064 return NULL;
2065}
2066
Tim Peters602f7402002-04-27 18:03:26 +00002067/* Allocation strategy: if the string is short, convert into a stack buffer
2068 and allocate exactly as much space needed at the end. Else allocate the
2069 maximum possible needed (4 result bytes per Unicode character), and return
2070 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002071*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002072PyObject *
2073PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002074 Py_ssize_t size,
2075 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076{
Tim Peters602f7402002-04-27 18:03:26 +00002077#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002078
Martin v. Löwis18e16552006-02-15 17:27:45 +00002079 Py_ssize_t i; /* index into s of next input byte */
Tim Peters602f7402002-04-27 18:03:26 +00002080 PyObject *v; /* result string object */
2081 char *p; /* next free byte in output buffer */
Martin v. Löwis18e16552006-02-15 17:27:45 +00002082 Py_ssize_t nallocated; /* number of result bytes allocated */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002083 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002084 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002085
Tim Peters602f7402002-04-27 18:03:26 +00002086 assert(s != NULL);
2087 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088
Tim Peters602f7402002-04-27 18:03:26 +00002089 if (size <= MAX_SHORT_UNICHARS) {
2090 /* Write into the stack buffer; nallocated can't overflow.
2091 * At the end, we'll allocate exactly as much heap space as it
2092 * turns out we need.
2093 */
2094 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2095 v = NULL; /* will allocate after we're done */
2096 p = stackbuf;
2097 }
2098 else {
2099 /* Overallocate on the heap, and give the excess back at the end. */
2100 nallocated = size * 4;
2101 if (nallocated / 4 != size) /* overflow! */
2102 return PyErr_NoMemory();
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002103 v = PyString_FromStringAndSize(NULL, nallocated);
Tim Peters602f7402002-04-27 18:03:26 +00002104 if (v == NULL)
2105 return NULL;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002106 p = PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002107 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002108
Tim Peters602f7402002-04-27 18:03:26 +00002109 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002110 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002111
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002112 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002113 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002115
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002117 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002118 *p++ = (char)(0xc0 | (ch >> 6));
2119 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002120 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002121 else {
Tim Peters602f7402002-04-27 18:03:26 +00002122 /* Encode UCS2 Unicode ordinals */
2123 if (ch < 0x10000) {
2124 /* Special case: check for high surrogate */
2125 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2126 Py_UCS4 ch2 = s[i];
2127 /* Check for low surrogate and combine the two to
2128 form a UCS4 value */
2129 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002130 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00002131 i++;
2132 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002133 }
Tim Peters602f7402002-04-27 18:03:26 +00002134 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002135 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002136 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00002137 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2138 *p++ = (char)(0x80 | (ch & 0x3f));
2139 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00002140 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002141 encodeUCS4:
Tim Peters602f7402002-04-27 18:03:26 +00002142 /* Encode UCS4 Unicode ordinals */
2143 *p++ = (char)(0xf0 | (ch >> 18));
2144 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2145 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2146 *p++ = (char)(0x80 | (ch & 0x3f));
2147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148 }
Tim Peters0eca65c2002-04-21 17:28:06 +00002149
Tim Peters602f7402002-04-27 18:03:26 +00002150 if (v == NULL) {
2151 /* This was stack allocated. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00002152 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00002153 assert(nneeded <= nallocated);
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002154 v = PyString_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00002155 }
2156 else {
Benjamin Peterson857ce152009-01-31 16:29:18 +00002157 /* Cut back to size actually needed. */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002158 nneeded = p - PyString_AS_STRING(v);
Tim Peters602f7402002-04-27 18:03:26 +00002159 assert(nneeded <= nallocated);
Benjamin Petersonbea424a2010-04-03 00:57:33 +00002160 if (_PyString_Resize(&v, nneeded))
2161 return NULL;
Tim Peters602f7402002-04-27 18:03:26 +00002162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002164
Tim Peters602f7402002-04-27 18:03:26 +00002165#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166}
2167
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2169{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 if (!PyUnicode_Check(unicode)) {
2171 PyErr_BadArgument();
2172 return NULL;
2173 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00002174 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002175 PyUnicode_GET_SIZE(unicode),
2176 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177}
2178
Walter Dörwald6e390802007-08-17 16:41:28 +00002179/* --- UTF-32 Codec ------------------------------------------------------- */
2180
2181PyObject *
2182PyUnicode_DecodeUTF32(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002183 Py_ssize_t size,
2184 const char *errors,
2185 int *byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002186{
2187 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2188}
2189
2190PyObject *
2191PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002192 Py_ssize_t size,
2193 const char *errors,
2194 int *byteorder,
2195 Py_ssize_t *consumed)
Walter Dörwald6e390802007-08-17 16:41:28 +00002196{
2197 const char *starts = s;
2198 Py_ssize_t startinpos;
2199 Py_ssize_t endinpos;
2200 Py_ssize_t outpos;
2201 PyUnicodeObject *unicode;
2202 Py_UNICODE *p;
2203#ifndef Py_UNICODE_WIDE
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002204 int pairs = 0;
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002205 const unsigned char *qq;
Walter Dörwald6e390802007-08-17 16:41:28 +00002206#else
2207 const int pairs = 0;
2208#endif
Benjamin Peterson8e5effa2010-06-12 17:47:06 +00002209 const unsigned char *q, *e;
Walter Dörwald6e390802007-08-17 16:41:28 +00002210 int bo = 0; /* assume native ordering by default */
2211 const char *errmsg = "";
Walter Dörwald20b40d32007-08-17 16:52:50 +00002212 /* Offsets from q for retrieving bytes in the right order. */
2213#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2214 int iorder[] = {0, 1, 2, 3};
2215#else
2216 int iorder[] = {3, 2, 1, 0};
2217#endif
Walter Dörwald9ab80a92007-08-17 16:58:43 +00002218 PyObject *errorHandler = NULL;
2219 PyObject *exc = NULL;
Chris Jerdonekad4b0002012-10-07 20:37:54 -07002220
Walter Dörwald6e390802007-08-17 16:41:28 +00002221 q = (unsigned char *)s;
2222 e = q + size;
2223
2224 if (byteorder)
2225 bo = *byteorder;
2226
2227 /* Check for BOM marks (U+FEFF) in the input and adjust current
2228 byte order setting accordingly. In native mode, the leading BOM
2229 mark is skipped, in all other modes, it is copied to the output
2230 stream as-is (giving a ZWNBSP character). */
2231 if (bo == 0) {
2232 if (size >= 4) {
2233 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002234 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002235#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002236 if (bom == 0x0000FEFF) {
2237 q += 4;
2238 bo = -1;
2239 }
2240 else if (bom == 0xFFFE0000) {
2241 q += 4;
2242 bo = 1;
2243 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002244#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002245 if (bom == 0x0000FEFF) {
2246 q += 4;
2247 bo = 1;
2248 }
2249 else if (bom == 0xFFFE0000) {
2250 q += 4;
2251 bo = -1;
2252 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002253#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002254 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002255 }
2256
2257 if (bo == -1) {
2258 /* force LE */
2259 iorder[0] = 0;
2260 iorder[1] = 1;
2261 iorder[2] = 2;
2262 iorder[3] = 3;
2263 }
2264 else if (bo == 1) {
2265 /* force BE */
2266 iorder[0] = 3;
2267 iorder[1] = 2;
2268 iorder[2] = 1;
2269 iorder[3] = 0;
2270 }
2271
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002272 /* On narrow builds we split characters outside the BMP into two
2273 codepoints => count how much extra space we need. */
2274#ifndef Py_UNICODE_WIDE
Serhiy Storchakac9631a12013-01-08 22:43:18 +02002275 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucca3a3f2010-06-11 21:42:26 +00002276 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2277 pairs++;
2278#endif
2279
2280 /* This might be one to much, because of a BOM */
2281 unicode = _PyUnicode_New((size+3)/4+pairs);
2282 if (!unicode)
2283 return NULL;
2284 if (size == 0)
2285 return (PyObject *)unicode;
2286
2287 /* Unpack UTF-32 encoded data */
2288 p = unicode->str;
2289
Walter Dörwald6e390802007-08-17 16:41:28 +00002290 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002291 Py_UCS4 ch;
2292 /* remaining bytes at the end? (size should be divisible by 4) */
2293 if (e-q<4) {
2294 if (consumed)
2295 break;
2296 errmsg = "truncated data";
2297 startinpos = ((const char *)q)-starts;
2298 endinpos = ((const char *)e)-starts;
2299 goto utf32Error;
2300 /* The remaining input chars are ignored if the callback
2301 chooses to skip the input */
2302 }
2303 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2304 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald6e390802007-08-17 16:41:28 +00002305
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002306 if (ch >= 0x110000)
2307 {
2308 errmsg = "codepoint not in range(0x110000)";
2309 startinpos = ((const char *)q)-starts;
2310 endinpos = startinpos+4;
2311 goto utf32Error;
2312 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002313#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002314 if (ch >= 0x10000)
2315 {
2316 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2317 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2318 }
2319 else
Walter Dörwald6e390802007-08-17 16:41:28 +00002320#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002321 *p++ = ch;
2322 q += 4;
2323 continue;
2324 utf32Error:
2325 outpos = p-PyUnicode_AS_UNICODE(unicode);
2326 if (unicode_decode_call_errorhandler(
2327 errors, &errorHandler,
2328 "utf32", errmsg,
Georg Brandle9741f32009-09-17 11:28:09 +00002329 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002330 &unicode, &outpos, &p))
2331 goto onError;
Walter Dörwald6e390802007-08-17 16:41:28 +00002332 }
2333
2334 if (byteorder)
2335 *byteorder = bo;
2336
2337 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002338 *consumed = (const char *)q-starts;
Walter Dörwald6e390802007-08-17 16:41:28 +00002339
2340 /* Adjust length */
2341 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2342 goto onError;
2343
2344 Py_XDECREF(errorHandler);
2345 Py_XDECREF(exc);
2346 return (PyObject *)unicode;
2347
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002348 onError:
Walter Dörwald6e390802007-08-17 16:41:28 +00002349 Py_DECREF(unicode);
2350 Py_XDECREF(errorHandler);
2351 Py_XDECREF(exc);
2352 return NULL;
2353}
2354
2355PyObject *
2356PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002357 Py_ssize_t size,
2358 const char *errors,
2359 int byteorder)
Walter Dörwald6e390802007-08-17 16:41:28 +00002360{
2361 PyObject *v;
2362 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002363 Py_ssize_t nsize, bytesize;
Walter Dörwald6e390802007-08-17 16:41:28 +00002364#ifndef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002365 Py_ssize_t i, pairs;
Walter Dörwald6e390802007-08-17 16:41:28 +00002366#else
2367 const int pairs = 0;
2368#endif
2369 /* Offsets from p for storing byte pairs in the right order. */
2370#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2371 int iorder[] = {0, 1, 2, 3};
2372#else
2373 int iorder[] = {3, 2, 1, 0};
2374#endif
2375
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002376#define STORECHAR(CH) \
2377 do { \
2378 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2379 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2380 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2381 p[iorder[0]] = (CH) & 0xff; \
2382 p += 4; \
Walter Dörwald6e390802007-08-17 16:41:28 +00002383 } while(0)
2384
2385 /* In narrow builds we can output surrogate pairs as one codepoint,
2386 so we need less space. */
2387#ifndef Py_UNICODE_WIDE
2388 for (i = pairs = 0; i < size-1; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002389 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2390 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2391 pairs++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002392#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002393 nsize = (size - pairs + (byteorder == 0));
2394 bytesize = nsize * 4;
2395 if (bytesize / 4 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002396 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002397 v = PyString_FromStringAndSize(NULL, bytesize);
Walter Dörwald6e390802007-08-17 16:41:28 +00002398 if (v == NULL)
2399 return NULL;
2400
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002401 p = (unsigned char *)PyString_AS_STRING(v);
Walter Dörwald6e390802007-08-17 16:41:28 +00002402 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002403 STORECHAR(0xFEFF);
Walter Dörwald6e390802007-08-17 16:41:28 +00002404 if (size == 0)
2405 return v;
2406
2407 if (byteorder == -1) {
2408 /* force LE */
2409 iorder[0] = 0;
2410 iorder[1] = 1;
2411 iorder[2] = 2;
2412 iorder[3] = 3;
2413 }
2414 else if (byteorder == 1) {
2415 /* force BE */
2416 iorder[0] = 3;
2417 iorder[1] = 2;
2418 iorder[2] = 1;
2419 iorder[3] = 0;
2420 }
2421
2422 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002423 Py_UCS4 ch = *s++;
Walter Dörwald6e390802007-08-17 16:41:28 +00002424#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002425 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2426 Py_UCS4 ch2 = *s;
2427 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2428 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2429 s++;
2430 size--;
2431 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00002432 }
Walter Dörwald6e390802007-08-17 16:41:28 +00002433#endif
2434 STORECHAR(ch);
2435 }
2436 return v;
2437#undef STORECHAR
2438}
2439
2440PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2441{
2442 if (!PyUnicode_Check(unicode)) {
2443 PyErr_BadArgument();
2444 return NULL;
2445 }
2446 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002447 PyUnicode_GET_SIZE(unicode),
2448 NULL,
2449 0);
Walter Dörwald6e390802007-08-17 16:41:28 +00002450}
2451
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452/* --- UTF-16 Codec ------------------------------------------------------- */
2453
Tim Peters772747b2001-08-09 22:21:55 +00002454PyObject *
2455PyUnicode_DecodeUTF16(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002456 Py_ssize_t size,
2457 const char *errors,
2458 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459{
Walter Dörwald69652032004-09-07 20:24:22 +00002460 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2461}
2462
2463PyObject *
2464PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002465 Py_ssize_t size,
2466 const char *errors,
2467 int *byteorder,
2468 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002469{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002470 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002471 Py_ssize_t startinpos;
2472 Py_ssize_t endinpos;
2473 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 PyUnicodeObject *unicode;
2475 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00002476 const unsigned char *q, *e;
2477 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002478 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00002479 /* Offsets from q for retrieving byte pairs in the right order. */
2480#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2481 int ihi = 1, ilo = 0;
2482#else
2483 int ihi = 0, ilo = 1;
2484#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002485 PyObject *errorHandler = NULL;
2486 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487
2488 /* Note: size will always be longer than the resulting Unicode
2489 character count */
2490 unicode = _PyUnicode_New(size);
2491 if (!unicode)
2492 return NULL;
2493 if (size == 0)
2494 return (PyObject *)unicode;
2495
2496 /* Unpack UTF-16 encoded data */
2497 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00002498 q = (unsigned char *)s;
2499 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500
2501 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00002502 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002504 /* Check for BOM marks (U+FEFF) in the input and adjust current
2505 byte order setting accordingly. In native mode, the leading BOM
2506 mark is skipped, in all other modes, it is copied to the output
2507 stream as-is (giving a ZWNBSP character). */
2508 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00002509 if (size >= 2) {
2510 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002511#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002512 if (bom == 0xFEFF) {
2513 q += 2;
2514 bo = -1;
2515 }
2516 else if (bom == 0xFFFE) {
2517 q += 2;
2518 bo = 1;
2519 }
Tim Petersced69f82003-09-16 20:30:58 +00002520#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002521 if (bom == 0xFEFF) {
2522 q += 2;
2523 bo = 1;
2524 }
2525 else if (bom == 0xFFFE) {
2526 q += 2;
2527 bo = -1;
2528 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002529#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002530 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00002531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532
Tim Peters772747b2001-08-09 22:21:55 +00002533 if (bo == -1) {
2534 /* force LE */
2535 ihi = 1;
2536 ilo = 0;
2537 }
2538 else if (bo == 1) {
2539 /* force BE */
2540 ihi = 0;
2541 ilo = 1;
2542 }
2543
2544 while (q < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002545 Py_UNICODE ch;
2546 /* remaining bytes at the end? (size should be even) */
2547 if (e-q<2) {
2548 if (consumed)
2549 break;
2550 errmsg = "truncated data";
2551 startinpos = ((const char *)q)-starts;
2552 endinpos = ((const char *)e)-starts;
2553 goto utf16Error;
2554 /* The remaining input chars are ignored if the callback
2555 chooses to skip the input */
2556 }
2557 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002558
Benjamin Peterson857ce152009-01-31 16:29:18 +00002559 q += 2;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002560
2561 if (ch < 0xD800 || ch > 0xDFFF) {
2562 *p++ = ch;
2563 continue;
2564 }
2565
2566 /* UTF-16 code pair: */
Antoine Pitrou715a63b2012-07-21 00:52:06 +02002567 if (e - q < 2) {
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002568 q -= 2;
2569 if (consumed)
2570 break;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002571 errmsg = "unexpected end of data";
Serhiy Storchakac4b82c02013-01-08 23:12:00 +02002572 startinpos = ((const char *)q)-starts;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002573 endinpos = ((const char *)e)-starts;
2574 goto utf16Error;
2575 }
2576 if (0xD800 <= ch && ch <= 0xDBFF) {
2577 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2578 q += 2;
2579 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00002580#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002581 *p++ = ch;
2582 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002583#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002584 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002585#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002586 continue;
2587 }
2588 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002589 errmsg = "illegal UTF-16 surrogate";
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002590 startinpos = (((const char *)q)-4)-starts;
2591 endinpos = startinpos+2;
2592 goto utf16Error;
2593 }
2594
Benjamin Peterson857ce152009-01-31 16:29:18 +00002595 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002596 errmsg = "illegal encoding";
2597 startinpos = (((const char *)q)-2)-starts;
2598 endinpos = startinpos+2;
2599 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002600
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002601 utf16Error:
2602 outpos = p-PyUnicode_AS_UNICODE(unicode);
2603 if (unicode_decode_call_errorhandler(
2604 errors, &errorHandler,
2605 "utf16", errmsg,
2606 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2607 &unicode, &outpos, &p))
2608 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609 }
2610
2611 if (byteorder)
2612 *byteorder = bo;
2613
Walter Dörwald69652032004-09-07 20:24:22 +00002614 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002615 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00002616
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002618 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 goto onError;
2620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 Py_XDECREF(errorHandler);
2622 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 return (PyObject *)unicode;
2624
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002625 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002626 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002627 Py_XDECREF(errorHandler);
2628 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 return NULL;
2630}
2631
Tim Peters772747b2001-08-09 22:21:55 +00002632PyObject *
2633PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002634 Py_ssize_t size,
2635 const char *errors,
2636 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002637{
2638 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00002639 unsigned char *p;
Neal Norwitze7d8be82008-07-31 17:17:14 +00002640 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002641#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00002642 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002643#else
2644 const int pairs = 0;
2645#endif
Tim Peters772747b2001-08-09 22:21:55 +00002646 /* Offsets from p for storing byte pairs in the right order. */
2647#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2648 int ihi = 1, ilo = 0;
2649#else
2650 int ihi = 0, ilo = 1;
2651#endif
2652
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002653#define STORECHAR(CH) \
2654 do { \
2655 p[ihi] = ((CH) >> 8) & 0xff; \
2656 p[ilo] = (CH) & 0xff; \
2657 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00002658 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002660#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002661 for (i = pairs = 0; i < size; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002662 if (s[i] >= 0x10000)
2663 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002664#endif
Neal Norwitze7d8be82008-07-31 17:17:14 +00002665 /* 2 * (size + pairs + (byteorder == 0)) */
2666 if (size > PY_SSIZE_T_MAX ||
2667 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002668 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002669 nsize = size + pairs + (byteorder == 0);
2670 bytesize = nsize * 2;
2671 if (bytesize / 2 != nsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002672 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00002673 v = PyString_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 if (v == NULL)
2675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676
Gregory P. Smithdd96db62008-06-09 04:58:54 +00002677 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002678 if (byteorder == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002679 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002680 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002681 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002682
2683 if (byteorder == -1) {
2684 /* force LE */
2685 ihi = 1;
2686 ilo = 0;
2687 }
2688 else if (byteorder == 1) {
2689 /* force BE */
2690 ihi = 0;
2691 ilo = 1;
2692 }
2693
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002694 while (size-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002695 Py_UNICODE ch = *s++;
2696 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002697#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002698 if (ch >= 0x10000) {
2699 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2700 ch = 0xD800 | ((ch-0x10000) >> 10);
2701 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002702#endif
Tim Peters772747b2001-08-09 22:21:55 +00002703 STORECHAR(ch);
2704 if (ch2)
2705 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 return v;
Tim Peters772747b2001-08-09 22:21:55 +00002708#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709}
2710
2711PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2712{
2713 if (!PyUnicode_Check(unicode)) {
2714 PyErr_BadArgument();
2715 return NULL;
2716 }
2717 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002718 PyUnicode_GET_SIZE(unicode),
2719 NULL,
2720 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002721}
2722
2723/* --- Unicode Escape Codec ----------------------------------------------- */
2724
Fredrik Lundh06d12682001-01-24 07:59:11 +00002725static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00002726
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002728 Py_ssize_t size,
2729 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002731 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002732 Py_ssize_t startinpos;
2733 Py_ssize_t endinpos;
2734 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002735 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002737 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002738 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002739 char* message;
2740 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002741 PyObject *errorHandler = NULL;
2742 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002743
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 /* Escaped strings will always be longer than the resulting
2745 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 length after conversion to the true value.
2747 (but if the error callback returns a long replacement string
2748 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 v = _PyUnicode_New(size);
2750 if (v == NULL)
2751 goto onError;
2752 if (size == 0)
2753 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002755 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002757
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 while (s < end) {
2759 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00002760 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002761 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762
2763 /* Non-escape characters are interpreted as Unicode ordinals */
2764 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002765 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 continue;
2767 }
2768
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002769 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 /* \ - Escapes */
2771 s++;
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002772 c = *s++;
2773 if (s > end)
2774 c = '\0'; /* Invalid after \ */
2775 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002777 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 case '\n': break;
2779 case '\\': *p++ = '\\'; break;
2780 case '\'': *p++ = '\''; break;
2781 case '\"': *p++ = '\"'; break;
2782 case 'b': *p++ = '\b'; break;
2783 case 'f': *p++ = '\014'; break; /* FF */
2784 case 't': *p++ = '\t'; break;
2785 case 'n': *p++ = '\n'; break;
2786 case 'r': *p++ = '\r'; break;
2787 case 'v': *p++ = '\013'; break; /* VT */
2788 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2789
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002790 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 case '0': case '1': case '2': case '3':
2792 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002793 x = s[-1] - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002794 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002795 x = (x<<3) + *s++ - '0';
Guido van Rossum1c1ac382007-10-29 22:15:05 +00002796 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002797 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00002799 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800 break;
2801
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002802 /* hex escapes */
2803 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002805 digits = 2;
2806 message = "truncated \\xXX escape";
2807 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002809 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002811 digits = 4;
2812 message = "truncated \\uXXXX escape";
2813 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002814
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002815 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00002816 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00002817 digits = 8;
2818 message = "truncated \\UXXXXXXXX escape";
2819 hexescape:
2820 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002821 outpos = p-PyUnicode_AS_UNICODE(v);
2822 if (s+digits>end) {
2823 endinpos = size;
2824 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002825 errors, &errorHandler,
2826 "unicodeescape", "end of string in escape sequence",
2827 starts, size, &startinpos, &endinpos, &exc, &s,
2828 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002829 goto onError;
2830 goto nextByte;
2831 }
2832 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00002833 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00002834 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002835 endinpos = (s+i+1)-starts;
2836 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002837 errors, &errorHandler,
2838 "unicodeescape", message,
2839 starts, size, &startinpos, &endinpos, &exc, &s,
2840 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002841 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00002843 }
2844 chr = (chr<<4) & ~0xF;
2845 if (c >= '0' && c <= '9')
2846 chr += c - '0';
2847 else if (c >= 'a' && c <= 'f')
2848 chr += 10 + c - 'a';
2849 else
2850 chr += 10 + c - 'A';
2851 }
2852 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00002853 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002854 /* _decoding_error will have already written into the
2855 target buffer. */
2856 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002857 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00002858 /* when we get here, chr is a 32-bit unicode character */
2859 if (chr <= 0xffff)
2860 /* UCS-2 character */
2861 *p++ = (Py_UNICODE) chr;
2862 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002863 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00002864 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00002865#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002866 *p++ = chr;
2867#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00002868 chr -= 0x10000L;
2869 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00002870 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002871#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00002872 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002873 endinpos = s-starts;
2874 outpos = p-PyUnicode_AS_UNICODE(v);
2875 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002876 errors, &errorHandler,
2877 "unicodeescape", "illegal Unicode character",
2878 starts, size, &startinpos, &endinpos, &exc, &s,
2879 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00002880 goto onError;
2881 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002882 break;
2883
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002884 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00002885 case 'N':
2886 message = "malformed \\N character escape";
2887 if (ucnhash_CAPI == NULL) {
2888 /* load the unicode data module */
Larry Hastings402b73f2010-03-25 00:54:54 +00002889 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00002890 if (ucnhash_CAPI == NULL)
2891 goto ucnhashError;
2892 }
2893 if (*s == '{') {
2894 const char *start = s+1;
2895 /* look for the closing brace */
2896 while (*s != '}' && s < end)
2897 s++;
2898 if (s > start && s < end && *s == '}') {
2899 /* found a name. look it up in the unicode database */
2900 message = "unknown Unicode character name";
2901 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00002902 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002903 goto store;
2904 }
2905 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002906 endinpos = s-starts;
2907 outpos = p-PyUnicode_AS_UNICODE(v);
2908 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002909 errors, &errorHandler,
2910 "unicodeescape", message,
2911 starts, size, &startinpos, &endinpos, &exc, &s,
2912 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00002913 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00002914 break;
2915
2916 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00002917 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002918 message = "\\ at end of string";
2919 s--;
2920 endinpos = s-starts;
2921 outpos = p-PyUnicode_AS_UNICODE(v);
2922 if (unicode_decode_call_errorhandler(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002923 errors, &errorHandler,
2924 "unicodeescape", message,
2925 starts, size, &startinpos, &endinpos, &exc, &s,
2926 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00002927 goto onError;
2928 }
2929 else {
2930 *p++ = '\\';
2931 *p++ = (unsigned char)s[-1];
2932 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00002933 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002935 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002936 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00002938 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002939 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00002940 Py_XDECREF(errorHandler);
2941 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002942 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00002943
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002944 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00002945 PyErr_SetString(
2946 PyExc_UnicodeError,
2947 "\\N escapes not supported (can't load unicodedata module)"
2948 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00002949 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002950 Py_XDECREF(errorHandler);
2951 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00002952 return NULL;
2953
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002954 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002955 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002956 Py_XDECREF(errorHandler);
2957 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 return NULL;
2959}
2960
/* unicodeescape_string() (defined after the findchar() helper below)
   returns a Unicode-Escape string version of the Unicode data.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2967
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00002968Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00002969 Py_ssize_t size,
2970 Py_UNICODE ch)
Fredrik Lundh347ee272006-05-24 16:35:18 +00002971{
2972 /* like wcschr, but doesn't stop at NULL characters */
2973
2974 while (size-- > 0) {
2975 if (*s == ch)
2976 return s;
2977 s++;
2978 }
2979
2980 return NULL;
2981}
Barry Warsaw51ac5802000-03-20 16:36:48 +00002982
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983static
2984PyObject *unicodeescape_string(const Py_UNICODE *s,
Martin v. Löwis18e16552006-02-15 17:27:45 +00002985 Py_ssize_t size,
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 int quotes)
2987{
2988 PyObject *repr;
2989 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002991 static const char *hexdigit = "0123456789abcdef";
Neal Norwitze7d8be82008-07-31 17:17:14 +00002992#ifdef Py_UNICODE_WIDE
2993 const Py_ssize_t expandsize = 10;
2994#else
2995 const Py_ssize_t expandsize = 6;
2996#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997
Neal Norwitz17753ec2006-08-21 22:21:19 +00002998 /* XXX(nnorwitz): rather than over-allocating, it would be
2999 better to choose a different scheme. Perhaps scan the
3000 first N-chars of the string and allocate based on that size.
3001 */
3002 /* Initial allocation is based on the longest-possible unichr
3003 escape.
3004
3005 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3006 unichr, so in this case it's the longest unichr escape. In
3007 narrow (UTF-16) builds this is five chars per source unichr
3008 since there are two unichrs in the surrogate pair, so in narrow
3009 (UTF-16) builds it's not the longest unichr escape.
3010
3011 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3012 so in the narrow (UTF-16) build case it's the longest unichr
3013 escape.
3014 */
3015
Neal Norwitze7d8be82008-07-31 17:17:14 +00003016 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003017 return PyErr_NoMemory();
Neal Norwitze7d8be82008-07-31 17:17:14 +00003018
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003019 repr = PyString_FromStringAndSize(NULL,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003020 2
3021 + expandsize*size
3022 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 if (repr == NULL)
3024 return NULL;
3025
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003026 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027
3028 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00003030 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 !findchar(s, size, '"')) ? '"' : '\'';
3032 }
3033 while (size-- > 0) {
3034 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003035
Hye-Shik Chang835b2432005-12-17 04:38:31 +00003036 /* Escape quotes and backslashes */
3037 if ((quotes &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003038 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 *p++ = '\\';
3040 *p++ = (char) ch;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003041 continue;
Tim Petersced69f82003-09-16 20:30:58 +00003042 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003043
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00003044#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003045 /* Map 21-bit characters to '\U00xxxxxx' */
3046 else if (ch >= 0x10000) {
3047 *p++ = '\\';
3048 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003049 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3050 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3051 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3052 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3053 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3054 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3055 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003056 *p++ = hexdigit[ch & 0x0000000F];
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003057 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003058 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003059#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003060 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3061 else if (ch >= 0xD800 && ch < 0xDC00) {
3062 Py_UNICODE ch2;
3063 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00003064
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003065 ch2 = *s++;
3066 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003067 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003068 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3069 *p++ = '\\';
3070 *p++ = 'U';
3071 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3072 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3073 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3074 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3075 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3076 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3077 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3078 *p++ = hexdigit[ucs & 0x0000000F];
3079 continue;
3080 }
3081 /* Fall through: isolated surrogates are copied as-is */
3082 s--;
3083 size++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003084 }
Neal Norwitz17753ec2006-08-21 22:21:19 +00003085#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003086
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003088 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 *p++ = '\\';
3090 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003091 *p++ = hexdigit[(ch >> 12) & 0x000F];
3092 *p++ = hexdigit[(ch >> 8) & 0x000F];
3093 *p++ = hexdigit[(ch >> 4) & 0x000F];
3094 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003096
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003097 /* Map special whitespace to '\t', \n', '\r' */
3098 else if (ch == '\t') {
3099 *p++ = '\\';
3100 *p++ = 't';
3101 }
3102 else if (ch == '\n') {
3103 *p++ = '\\';
3104 *p++ = 'n';
3105 }
3106 else if (ch == '\r') {
3107 *p++ = '\\';
3108 *p++ = 'r';
3109 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003110
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003111 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00003112 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003114 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003115 *p++ = hexdigit[(ch >> 4) & 0x000F];
3116 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00003117 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00003118
Guido van Rossumd57fd912000-03-10 22:53:23 +00003119 /* Copy everything else as-is */
3120 else
3121 *p++ = (char) ch;
3122 }
3123 if (quotes)
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003124 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
3126 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003127 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3128 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 return repr;
3130}
3131
3132PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003133 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134{
3135 return unicodeescape_string(s, size, 0);
3136}
3137
3138PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3139{
3140 if (!PyUnicode_Check(unicode)) {
3141 PyErr_BadArgument();
3142 return NULL;
3143 }
3144 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003145 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146}
3147
3148/* --- Raw Unicode Escape Codec ------------------------------------------- */
3149
3150PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003151 Py_ssize_t size,
3152 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003154 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003155 Py_ssize_t startinpos;
3156 Py_ssize_t endinpos;
3157 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003159 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 const char *end;
3161 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003162 PyObject *errorHandler = NULL;
3163 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003164
Guido van Rossumd57fd912000-03-10 22:53:23 +00003165 /* Escaped strings will always be longer than the resulting
3166 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003167 length after conversion to the true value. (But decoding error
3168 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169 v = _PyUnicode_New(size);
3170 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003171 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003173 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003174 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175 end = s + size;
3176 while (s < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003177 unsigned char c;
3178 Py_UCS4 x;
3179 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003180 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003182 /* Non-escape characters are interpreted as Unicode ordinals */
3183 if (*s != '\\') {
3184 *p++ = (unsigned char)*s++;
3185 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003186 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003187 startinpos = s-starts;
3188
3189 /* \u-escapes are only interpreted iff the number of leading
3190 backslashes if odd */
3191 bs = s;
3192 for (;s < end;) {
3193 if (*s != '\\')
3194 break;
3195 *p++ = (unsigned char)*s++;
3196 }
3197 if (((s - bs) & 1) == 0 ||
3198 s >= end ||
3199 (*s != 'u' && *s != 'U')) {
3200 continue;
3201 }
3202 p--;
3203 count = *s=='u' ? 4 : 8;
3204 s++;
3205
3206 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3207 outpos = p-PyUnicode_AS_UNICODE(v);
3208 for (x = 0, i = 0; i < count; ++i, ++s) {
3209 c = (unsigned char)*s;
3210 if (!isxdigit(c)) {
3211 endinpos = s-starts;
3212 if (unicode_decode_call_errorhandler(
3213 errors, &errorHandler,
3214 "rawunicodeescape", "truncated \\uXXXX",
3215 starts, size, &startinpos, &endinpos, &exc, &s,
3216 &v, &outpos, &p))
3217 goto onError;
3218 goto nextByte;
3219 }
3220 x = (x<<4) & ~0xF;
3221 if (c >= '0' && c <= '9')
3222 x += c - '0';
3223 else if (c >= 'a' && c <= 'f')
3224 x += 10 + c - 'a';
3225 else
3226 x += 10 + c - 'A';
3227 }
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003228 if (x <= 0xffff)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003229 /* UCS-2 character */
3230 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003231 else if (x <= 0x10ffff) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003232 /* UCS-4 character. Either store directly, or as
3233 surrogate pair. */
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003234#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003235 *p++ = (Py_UNICODE) x;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003236#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003237 x -= 0x10000L;
3238 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3239 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003240#endif
3241 } else {
3242 endinpos = s-starts;
3243 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003244 if (unicode_decode_call_errorhandler(
3245 errors, &errorHandler,
3246 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003247 starts, size, &startinpos, &endinpos, &exc, &s,
3248 &v, &outpos, &p))
3249 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003250 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003251 nextByte:
3252 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 }
Martin v. Löwis412fb672006-04-13 06:34:32 +00003254 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003255 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003256 Py_XDECREF(errorHandler);
3257 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003259
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003260 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003262 Py_XDECREF(errorHandler);
3263 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 return NULL;
3265}
3266
3267PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003268 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269{
3270 PyObject *repr;
3271 char *p;
3272 char *q;
3273
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00003274 static const char *hexdigit = "0123456789abcdef";
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003275#ifdef Py_UNICODE_WIDE
Neal Norwitze7d8be82008-07-31 17:17:14 +00003276 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003277#else
Neal Norwitze7d8be82008-07-31 17:17:14 +00003278 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003279#endif
Benjamin Peterson857ce152009-01-31 16:29:18 +00003280
Neal Norwitze7d8be82008-07-31 17:17:14 +00003281 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003282 return PyErr_NoMemory();
Benjamin Peterson857ce152009-01-31 16:29:18 +00003283
Neal Norwitze7d8be82008-07-31 17:17:14 +00003284 repr = PyString_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003285 if (repr == NULL)
3286 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003287 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003288 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003289
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003290 p = q = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 while (size-- > 0) {
3292 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003293#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003294 /* Map 32-bit characters to '\Uxxxxxxxx' */
3295 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003296 *p++ = '\\';
3297 *p++ = 'U';
3298 *p++ = hexdigit[(ch >> 28) & 0xf];
3299 *p++ = hexdigit[(ch >> 24) & 0xf];
3300 *p++ = hexdigit[(ch >> 20) & 0xf];
3301 *p++ = hexdigit[(ch >> 16) & 0xf];
3302 *p++ = hexdigit[(ch >> 12) & 0xf];
3303 *p++ = hexdigit[(ch >> 8) & 0xf];
3304 *p++ = hexdigit[(ch >> 4) & 0xf];
3305 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00003306 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003307 else
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003308#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003309 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3310 if (ch >= 0xD800 && ch < 0xDC00) {
3311 Py_UNICODE ch2;
3312 Py_UCS4 ucs;
Amaury Forgeot d'Arc9a0d3462008-03-23 09:55:29 +00003313
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003314 ch2 = *s++;
3315 size--;
Georg Brandle27d0442010-08-01 20:54:30 +00003316 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003317 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3318 *p++ = '\\';
3319 *p++ = 'U';
3320 *p++ = hexdigit[(ucs >> 28) & 0xf];
3321 *p++ = hexdigit[(ucs >> 24) & 0xf];
3322 *p++ = hexdigit[(ucs >> 20) & 0xf];
3323 *p++ = hexdigit[(ucs >> 16) & 0xf];
3324 *p++ = hexdigit[(ucs >> 12) & 0xf];
3325 *p++ = hexdigit[(ucs >> 8) & 0xf];
3326 *p++ = hexdigit[(ucs >> 4) & 0xf];
3327 *p++ = hexdigit[ucs & 0xf];
3328 continue;
3329 }
3330 /* Fall through: isolated surrogates are copied as-is */
3331 s--;
3332 size++;
3333 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00003334#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003335 /* Map 16-bit characters to '\uxxxx' */
3336 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 *p++ = '\\';
3338 *p++ = 'u';
3339 *p++ = hexdigit[(ch >> 12) & 0xf];
3340 *p++ = hexdigit[(ch >> 8) & 0xf];
3341 *p++ = hexdigit[(ch >> 4) & 0xf];
3342 *p++ = hexdigit[ch & 15];
3343 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003344 /* Copy everything else as-is */
3345 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346 *p++ = (char) ch;
3347 }
3348 *p = '\0';
Benjamin Petersonbea424a2010-04-03 00:57:33 +00003349 if (_PyString_Resize(&repr, p - q))
3350 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351 return repr;
3352}
3353
3354PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3355{
3356 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003357 PyErr_BadArgument();
3358 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 }
3360 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003361 PyUnicode_GET_SIZE(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362}
3363
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003364/* --- Unicode Internal Codec ------------------------------------------- */
3365
3366PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003367 Py_ssize_t size,
3368 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003369{
3370 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003371 Py_ssize_t startinpos;
3372 Py_ssize_t endinpos;
3373 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003374 PyUnicodeObject *v;
3375 Py_UNICODE *p;
3376 const char *end;
3377 const char *reason;
3378 PyObject *errorHandler = NULL;
3379 PyObject *exc = NULL;
3380
Neal Norwitzd43069c2006-01-08 01:12:10 +00003381#ifdef Py_UNICODE_WIDE
3382 Py_UNICODE unimax = PyUnicode_GetMax();
3383#endif
3384
Armin Rigo7ccbca92006-10-04 12:17:45 +00003385 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003386 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3387 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003388 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003389 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003390 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003391 p = PyUnicode_AS_UNICODE(v);
3392 end = s + size;
3393
3394 while (s < end) {
Neal Norwitz1004a532006-05-15 07:17:23 +00003395 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003396 /* We have to sanity check the raw data, otherwise doom looms for
3397 some malformed UCS-4 data. */
3398 if (
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003399#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003400 *p > unimax || *p < 0 ||
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003401#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003402 end-s < Py_UNICODE_SIZE
3403 )
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003404 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003405 startinpos = s - starts;
3406 if (end-s < Py_UNICODE_SIZE) {
3407 endinpos = end-starts;
3408 reason = "truncated input";
3409 }
3410 else {
3411 endinpos = s - starts + Py_UNICODE_SIZE;
3412 reason = "illegal code point (> 0x10FFFF)";
3413 }
3414 outpos = p - PyUnicode_AS_UNICODE(v);
3415 if (unicode_decode_call_errorhandler(
3416 errors, &errorHandler,
3417 "unicode_internal", reason,
3418 starts, size, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalotti034e08c2008-12-27 06:36:10 +00003419 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003420 goto onError;
3421 }
3422 }
3423 else {
3424 p++;
3425 s += Py_UNICODE_SIZE;
3426 }
3427 }
3428
Martin v. Löwis412fb672006-04-13 06:34:32 +00003429 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003430 goto onError;
3431 Py_XDECREF(errorHandler);
3432 Py_XDECREF(exc);
3433 return (PyObject *)v;
3434
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003435 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00003436 Py_XDECREF(v);
3437 Py_XDECREF(errorHandler);
3438 Py_XDECREF(exc);
3439 return NULL;
3440}
3441
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442/* --- Latin-1 Codec ------------------------------------------------------ */
3443
3444PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003445 Py_ssize_t size,
3446 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447{
3448 PyUnicodeObject *v;
3449 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00003450
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003452 if (size == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003453 Py_UNICODE r = *(unsigned char*)s;
3454 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003455 }
3456
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457 v = _PyUnicode_New(size);
3458 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003459 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003461 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003462 p = PyUnicode_AS_UNICODE(v);
3463 while (size-- > 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003464 *p++ = (unsigned char)*s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003466
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003467 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 Py_XDECREF(v);
3469 return NULL;
3470}
3471
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472/* create or adjust a UnicodeEncodeError */
3473static void make_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003474 const char *encoding,
3475 const Py_UNICODE *unicode, Py_ssize_t size,
3476 Py_ssize_t startpos, Py_ssize_t endpos,
3477 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 if (*exceptionObject == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003480 *exceptionObject = PyUnicodeEncodeError_Create(
3481 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003482 }
3483 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003484 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3485 goto onError;
3486 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3487 goto onError;
3488 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3489 goto onError;
3490 return;
3491 onError:
3492 Py_DECREF(*exceptionObject);
3493 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 }
3495}
3496
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003497/* raises a UnicodeEncodeError */
3498static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003499 const char *encoding,
3500 const Py_UNICODE *unicode, Py_ssize_t size,
3501 Py_ssize_t startpos, Py_ssize_t endpos,
3502 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003503{
3504 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003505 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003506 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003507 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508}
3509
3510/* error handling callback helper:
3511 build arguments, call the callback and check the arguments,
3512 put the result into newpos and return the replacement string, which
3513 has to be freed by the caller */
3514static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003515 PyObject **errorHandler,
3516 const char *encoding, const char *reason,
3517 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3518 Py_ssize_t startpos, Py_ssize_t endpos,
3519 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520{
Martin v. Löwis18e16552006-02-15 17:27:45 +00003521 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522
3523 PyObject *restuple;
3524 PyObject *resunicode;
3525
3526 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003527 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003528 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003529 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 }
3531
3532 make_encode_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003533 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003534 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003536
3537 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003538 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003539 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003540 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003541 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00003542 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003543 Py_DECREF(restuple);
3544 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 }
3546 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003547 &resunicode, newpos)) {
3548 Py_DECREF(restuple);
3549 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003550 }
3551 if (*newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003552 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003553 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003554 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3555 Py_DECREF(restuple);
3556 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003557 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558 Py_INCREF(resunicode);
3559 Py_DECREF(restuple);
3560 return resunicode;
3561}
3562
3563static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003564 Py_ssize_t size,
3565 const char *errors,
3566 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567{
3568 /* output object */
3569 PyObject *res;
3570 /* pointers to the beginning and end+1 of input */
3571 const Py_UNICODE *startp = p;
3572 const Py_UNICODE *endp = p + size;
3573 /* pointer to the beginning of the unencodable characters */
3574 /* const Py_UNICODE *badp = NULL; */
3575 /* pointer into the output */
3576 char *str;
3577 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00003578 Py_ssize_t respos = 0;
3579 Py_ssize_t ressize;
Anthony Baxtera6286212006-04-11 07:42:36 +00003580 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3581 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582 PyObject *errorHandler = NULL;
3583 PyObject *exc = NULL;
3584 /* the following variable is used for caching string comparisons
3585 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3586 int known_errorHandler = -1;
3587
3588 /* allocate enough for a simple encoding without
3589 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003590 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 if (res == NULL)
3592 goto onError;
3593 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003594 return res;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003595 str = PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 ressize = size;
3597
3598 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003599 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003601 /* can we encode this? */
3602 if (c<limit) {
3603 /* no overflow check, because we know that the space is enough */
3604 *str++ = (char)c;
3605 ++p;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003606 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003607 else {
3608 Py_ssize_t unicodepos = p-startp;
3609 Py_ssize_t requiredsize;
3610 PyObject *repunicode;
3611 Py_ssize_t repsize;
3612 Py_ssize_t newpos;
3613 Py_ssize_t respos;
3614 Py_UNICODE *uni2;
3615 /* startpos for collecting unencodable chars */
3616 const Py_UNICODE *collstart = p;
3617 const Py_UNICODE *collend = p;
3618 /* find all unecodable characters */
3619 while ((collend < endp) && ((*collend)>=limit))
3620 ++collend;
3621 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3622 if (known_errorHandler==-1) {
3623 if ((errors==NULL) || (!strcmp(errors, "strict")))
3624 known_errorHandler = 1;
3625 else if (!strcmp(errors, "replace"))
3626 known_errorHandler = 2;
3627 else if (!strcmp(errors, "ignore"))
3628 known_errorHandler = 3;
3629 else if (!strcmp(errors, "xmlcharrefreplace"))
3630 known_errorHandler = 4;
3631 else
3632 known_errorHandler = 0;
3633 }
3634 switch (known_errorHandler) {
3635 case 1: /* strict */
3636 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3637 goto onError;
3638 case 2: /* replace */
3639 while (collstart++<collend)
3640 *str++ = '?'; /* fall through */
3641 case 3: /* ignore */
3642 p = collend;
3643 break;
3644 case 4: /* xmlcharrefreplace */
3645 respos = str-PyString_AS_STRING(res);
3646 /* determine replacement size (temporarily (mis)uses p) */
3647 for (p = collstart, repsize = 0; p < collend; ++p) {
3648 if (*p<10)
3649 repsize += 2+1+1;
3650 else if (*p<100)
3651 repsize += 2+2+1;
3652 else if (*p<1000)
3653 repsize += 2+3+1;
3654 else if (*p<10000)
3655 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003656#ifndef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003657 else
3658 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00003659#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003660 else if (*p<100000)
3661 repsize += 2+5+1;
3662 else if (*p<1000000)
3663 repsize += 2+6+1;
3664 else
3665 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003666#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003667 }
3668 requiredsize = respos+repsize+(endp-collend);
3669 if (requiredsize > ressize) {
3670 if (requiredsize<2*ressize)
3671 requiredsize = 2*ressize;
3672 if (_PyString_Resize(&res, requiredsize))
3673 goto onError;
3674 str = PyString_AS_STRING(res) + respos;
3675 ressize = requiredsize;
3676 }
3677 /* generate replacement (temporarily (mis)uses p) */
3678 for (p = collstart; p < collend; ++p) {
3679 str += sprintf(str, "&#%d;", (int)*p);
3680 }
3681 p = collend;
3682 break;
3683 default:
3684 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3685 encoding, reason, startp, size, &exc,
3686 collstart-startp, collend-startp, &newpos);
3687 if (repunicode == NULL)
3688 goto onError;
3689 /* need more space? (at least enough for what we have+the
3690 replacement+the rest of the string, so we won't have to
3691 check space for encodable characters) */
3692 respos = str-PyString_AS_STRING(res);
3693 repsize = PyUnicode_GET_SIZE(repunicode);
3694 requiredsize = respos+repsize+(endp-collend);
3695 if (requiredsize > ressize) {
3696 if (requiredsize<2*ressize)
3697 requiredsize = 2*ressize;
3698 if (_PyString_Resize(&res, requiredsize)) {
3699 Py_DECREF(repunicode);
3700 goto onError;
3701 }
3702 str = PyString_AS_STRING(res) + respos;
3703 ressize = requiredsize;
3704 }
3705 /* check if there is anything unencodable in the replacement
3706 and copy it to the output */
3707 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3708 c = *uni2;
3709 if (c >= limit) {
3710 raise_encode_exception(&exc, encoding, startp, size,
3711 unicodepos, unicodepos+1, reason);
3712 Py_DECREF(repunicode);
3713 goto onError;
3714 }
3715 *str = (char)c;
3716 }
3717 p = startp + newpos;
Benjamin Peterson857ce152009-01-31 16:29:18 +00003718 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00003719 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00003720 }
3721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003723 respos = str-PyString_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003724 if (respos<ressize)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003725 /* If this falls res will be NULL */
3726 _PyString_Resize(&res, respos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003727 Py_XDECREF(errorHandler);
3728 Py_XDECREF(exc);
3729 return res;
3730
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003731 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003732 Py_XDECREF(res);
3733 Py_XDECREF(errorHandler);
3734 Py_XDECREF(exc);
3735 return NULL;
3736}
3737
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003739 Py_ssize_t size,
3740 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003742 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743}
3744
3745PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3746{
3747 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003748 PyErr_BadArgument();
3749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 }
3751 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003752 PyUnicode_GET_SIZE(unicode),
3753 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754}
3755
3756/* --- 7-bit ASCII Codec -------------------------------------------------- */
3757
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003759 Py_ssize_t size,
3760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 PyUnicodeObject *v;
3764 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003765 Py_ssize_t startinpos;
3766 Py_ssize_t endinpos;
3767 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 const char *e;
3769 PyObject *errorHandler = NULL;
3770 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00003771
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003773 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003774 Py_UNICODE r = *(unsigned char*)s;
3775 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003776 }
Tim Petersced69f82003-09-16 20:30:58 +00003777
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 v = _PyUnicode_New(size);
3779 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003780 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003782 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 e = s + size;
3785 while (s < e) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003786 register unsigned char c = (unsigned char)*s;
3787 if (c < 128) {
3788 *p++ = c;
3789 ++s;
3790 }
3791 else {
3792 startinpos = s-starts;
3793 endinpos = startinpos + 1;
3794 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3795 if (unicode_decode_call_errorhandler(
3796 errors, &errorHandler,
3797 "ascii", "ordinal not in range(128)",
3798 starts, size, &startinpos, &endinpos, &exc, &s,
3799 &v, &outpos, &p))
3800 goto onError;
3801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00003803 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003804 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3805 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 Py_XDECREF(errorHandler);
3807 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00003809
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003810 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003812 Py_XDECREF(errorHandler);
3813 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814 return NULL;
3815}
3816
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003818 Py_ssize_t size,
3819 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822}
3823
3824PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3825{
3826 if (!PyUnicode_Check(unicode)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003827 PyErr_BadArgument();
3828 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 }
3830 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003831 PyUnicode_GET_SIZE(unicode),
3832 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833}
3834
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00003835#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003836
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003837/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00003838
Hirokazu Yamamoto52a34922009-03-21 10:32:52 +00003839#if SIZEOF_INT < SIZEOF_SIZE_T
Martin v. Löwisd8251432006-06-14 05:21:04 +00003840#define NEED_RETRY
3841#endif
3842
3843/* XXX This code is limited to "true" double-byte encodings, as
3844 a) it assumes an incomplete character consists of a single byte, and
3845 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003846 encodings, see IsDBCSLeadByteEx documentation. */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003847
/* Return 1 when the byte at s[offset] acts as a DBCS lead byte, else 0.

   A byte that IsDBCSLeadByte() accepts is only counted as a lead byte when
   CharPrev() cannot step back from it, when the character before it does
   not start with a lead byte, or when that previous character is two bytes
   wide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3858
3859/*
3860 * Decode MBCS string into unicode object. If 'final' is set, converts
3861 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3862 */
3863static int decode_mbcs(PyUnicodeObject **v,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003864 const char *s, /* MBCS string */
3865 int size, /* sizeof MBCS string */
3866 int final)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003867{
3868 Py_UNICODE *p;
3869 Py_ssize_t n = 0;
3870 int usize = 0;
3871
3872 assert(size >= 0);
3873
3874 /* Skip trailing lead-byte unless 'final' is set */
3875 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003876 --size;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003877
3878 /* First get the size of the result */
3879 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003880 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3881 if (usize == 0) {
3882 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3883 return -1;
3884 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003885 }
3886
3887 if (*v == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003888 /* Create unicode object */
3889 *v = _PyUnicode_New(usize);
3890 if (*v == NULL)
3891 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003892 }
3893 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003894 /* Extend unicode object */
3895 n = PyUnicode_GET_SIZE(*v);
3896 if (_PyUnicode_Resize(v, n + usize) < 0)
3897 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003898 }
3899
3900 /* Do the conversion */
3901 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003902 p = PyUnicode_AS_UNICODE(*v) + n;
3903 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3904 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3905 return -1;
3906 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003907 }
3908
3909 return size;
3910}
3911
3912PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003913 Py_ssize_t size,
3914 const char *errors,
3915 Py_ssize_t *consumed)
Martin v. Löwisd8251432006-06-14 05:21:04 +00003916{
3917 PyUnicodeObject *v = NULL;
3918 int done;
3919
3920 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003921 *consumed = 0;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003922
3923#ifdef NEED_RETRY
3924 retry:
3925 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003926 done = decode_mbcs(&v, s, INT_MAX, 0);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003927 else
3928#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003929 done = decode_mbcs(&v, s, (int)size, !consumed);
Martin v. Löwisd8251432006-06-14 05:21:04 +00003930
3931 if (done < 0) {
3932 Py_XDECREF(v);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003933 return NULL;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003934 }
3935
3936 if (consumed)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003937 *consumed += done;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003938
3939#ifdef NEED_RETRY
3940 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003941 s += done;
3942 size -= done;
3943 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003944 }
3945#endif
3946
3947 return (PyObject *)v;
3948}
3949
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003950PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003951 Py_ssize_t size,
3952 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003953{
Martin v. Löwisd8251432006-06-14 05:21:04 +00003954 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3955}
3956
3957/*
3958 * Convert unicode into string object (MBCS).
3959 * Returns 0 if succeed, -1 otherwise.
3960 */
3961static int encode_mbcs(PyObject **repr,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003962 const Py_UNICODE *p, /* unicode */
3963 int size) /* size of unicode */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003964{
3965 int mbcssize = 0;
3966 Py_ssize_t n = 0;
3967
3968 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003969
3970 /* First get the size of the result */
Martin v. Löwisd8251432006-06-14 05:21:04 +00003971 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003972 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3973 if (mbcssize == 0) {
3974 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3975 return -1;
3976 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00003977 }
3978
Martin v. Löwisd8251432006-06-14 05:21:04 +00003979 if (*repr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003980 /* Create string object */
3981 *repr = PyString_FromStringAndSize(NULL, mbcssize);
3982 if (*repr == NULL)
3983 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003984 }
3985 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003986 /* Extend string object */
3987 n = PyString_Size(*repr);
3988 if (_PyString_Resize(repr, n + mbcssize) < 0)
3989 return -1;
Martin v. Löwisd8251432006-06-14 05:21:04 +00003990 }
3991
3992 /* Do the conversion */
3993 if (size > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00003994 char *s = PyString_AS_STRING(*repr) + n;
3995 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3996 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3997 return -1;
3998 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00003999 }
4000
4001 return 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004002}
4003
4004PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004005 Py_ssize_t size,
4006 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004007{
Martin v. Löwisd8251432006-06-14 05:21:04 +00004008 PyObject *repr = NULL;
4009 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00004010
Martin v. Löwisd8251432006-06-14 05:21:04 +00004011#ifdef NEED_RETRY
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004012 retry:
Martin v. Löwisd8251432006-06-14 05:21:04 +00004013 if (size > INT_MAX)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004014 ret = encode_mbcs(&repr, p, INT_MAX);
Martin v. Löwisd8251432006-06-14 05:21:04 +00004015 else
4016#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004017 ret = encode_mbcs(&repr, p, (int)size);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004018
Martin v. Löwisd8251432006-06-14 05:21:04 +00004019 if (ret < 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004020 Py_XDECREF(repr);
4021 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004022 }
Martin v. Löwisd8251432006-06-14 05:21:04 +00004023
4024#ifdef NEED_RETRY
4025 if (size > INT_MAX) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004026 p += INT_MAX;
4027 size -= INT_MAX;
4028 goto retry;
Martin v. Löwisd8251432006-06-14 05:21:04 +00004029 }
4030#endif
4031
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004032 return repr;
4033}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004034
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004035PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4036{
4037 if (!PyUnicode_Check(unicode)) {
4038 PyErr_BadArgument();
4039 return NULL;
4040 }
4041 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004042 PyUnicode_GET_SIZE(unicode),
4043 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00004044}
4045
Martin v. Löwisd8251432006-06-14 05:21:04 +00004046#undef NEED_RETRY
4047
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004048#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004049
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050/* --- Character Mapping Codec -------------------------------------------- */
4051
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004053 Py_ssize_t size,
4054 PyObject *mapping,
4055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004057 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004058 Py_ssize_t startinpos;
4059 Py_ssize_t endinpos;
4060 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004061 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 PyUnicodeObject *v;
4063 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004064 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004065 PyObject *errorHandler = NULL;
4066 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004067 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004068 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004069
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 /* Default to Latin-1 */
4071 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004072 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073
4074 v = _PyUnicode_New(size);
4075 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004076 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004078 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004081 if (PyUnicode_CheckExact(mapping)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004082 mapstring = PyUnicode_AS_UNICODE(mapping);
4083 maplen = PyUnicode_GET_SIZE(mapping);
4084 while (s < e) {
4085 unsigned char ch = *s;
4086 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004088 if (ch < maplen)
4089 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004091 if (x == 0xfffe) {
4092 /* undefined mapping */
4093 outpos = p-PyUnicode_AS_UNICODE(v);
4094 startinpos = s-starts;
4095 endinpos = startinpos+1;
4096 if (unicode_decode_call_errorhandler(
4097 errors, &errorHandler,
4098 "charmap", "character maps to <undefined>",
4099 starts, size, &startinpos, &endinpos, &exc, &s,
4100 &v, &outpos, &p)) {
4101 goto onError;
4102 }
4103 continue;
4104 }
4105 *p++ = x;
4106 ++s;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004107 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004108 }
4109 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004110 while (s < e) {
4111 unsigned char ch = *s;
4112 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00004113
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004114 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4115 w = PyInt_FromLong((long)ch);
4116 if (w == NULL)
4117 goto onError;
4118 x = PyObject_GetItem(mapping, w);
4119 Py_DECREF(w);
4120 if (x == NULL) {
4121 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4122 /* No mapping found means: mapping is undefined. */
4123 PyErr_Clear();
Serhiy Storchaka95997452013-01-15 14:42:59 +02004124 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004125 } else
4126 goto onError;
4127 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004128
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004129 /* Apply mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004130 if (x == Py_None)
4131 goto Undefined;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004132 if (PyInt_Check(x)) {
4133 long value = PyInt_AS_LONG(x);
Serhiy Storchaka95997452013-01-15 14:42:59 +02004134 if (value == 0xFFFE)
4135 goto Undefined;
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004136 if (value < 0 || value > 0x10FFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004137 PyErr_SetString(PyExc_TypeError,
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004138 "character mapping must be in range(0x110000)");
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004139 Py_DECREF(x);
4140 goto onError;
4141 }
Antoine Pitroue3ae3212012-11-17 21:14:58 +01004142
4143#ifndef Py_UNICODE_WIDE
4144 if (value > 0xFFFF) {
4145 /* see the code for 1-n mapping below */
4146 if (extrachars < 2) {
4147 /* resize first */
4148 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4149 Py_ssize_t needed = 10 - extrachars;
4150 extrachars += needed;
4151 /* XXX overflow detection missing */
4152 if (_PyUnicode_Resize(&v,
4153 PyUnicode_GET_SIZE(v) + needed) < 0) {
4154 Py_DECREF(x);
4155 goto onError;
4156 }
4157 p = PyUnicode_AS_UNICODE(v) + oldpos;
4158 }
4159 value -= 0x10000;
4160 *p++ = 0xD800 | (value >> 10);
4161 *p++ = 0xDC00 | (value & 0x3FF);
4162 extrachars -= 2;
4163 }
4164 else
4165#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004166 *p++ = (Py_UNICODE)value;
4167 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004168 else if (PyUnicode_Check(x)) {
4169 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004170
Serhiy Storchaka95997452013-01-15 14:42:59 +02004171 if (targetsize == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004172 /* 1-1 mapping */
Serhiy Storchaka95997452013-01-15 14:42:59 +02004173 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4174 if (value == 0xFFFE)
4175 goto Undefined;
4176 *p++ = value;
4177 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004178 else if (targetsize > 1) {
4179 /* 1-n mapping */
4180 if (targetsize > extrachars) {
4181 /* resize first */
4182 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4183 Py_ssize_t needed = (targetsize - extrachars) + \
4184 (targetsize << 2);
4185 extrachars += needed;
4186 /* XXX overflow detection missing */
4187 if (_PyUnicode_Resize(&v,
4188 PyUnicode_GET_SIZE(v) + needed) < 0) {
4189 Py_DECREF(x);
4190 goto onError;
4191 }
4192 p = PyUnicode_AS_UNICODE(v) + oldpos;
4193 }
4194 Py_UNICODE_COPY(p,
4195 PyUnicode_AS_UNICODE(x),
4196 targetsize);
4197 p += targetsize;
4198 extrachars -= targetsize;
4199 }
4200 /* 1-0 mapping: skip the character */
4201 }
4202 else {
4203 /* wrong return value */
4204 PyErr_SetString(PyExc_TypeError,
4205 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004206 Py_DECREF(x);
4207 goto onError;
4208 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004209 Py_DECREF(x);
4210 ++s;
Serhiy Storchaka95997452013-01-15 14:42:59 +02004211 continue;
4212Undefined:
4213 /* undefined mapping */
4214 Py_XDECREF(x);
4215 outpos = p-PyUnicode_AS_UNICODE(v);
4216 startinpos = s-starts;
4217 endinpos = startinpos+1;
4218 if (unicode_decode_call_errorhandler(
4219 errors, &errorHandler,
4220 "charmap", "character maps to <undefined>",
4221 starts, size, &startinpos, &endinpos, &exc, &s,
4222 &v, &outpos, &p)) {
4223 goto onError;
4224 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 }
4227 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004228 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4229 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004230 Py_XDECREF(errorHandler);
4231 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004233
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004234 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235 Py_XDECREF(errorHandler);
4236 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004237 Py_XDECREF(v);
4238 return NULL;
4239}
4240
Martin v. Löwis3f767792006-06-04 19:36:28 +00004241/* Charmap encoding: the lookup table */
4242
4243struct encoding_map{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004244 PyObject_HEAD
4245 unsigned char level1[32];
4246 int count2, count3;
4247 unsigned char level23[1];
Martin v. Löwis3f767792006-06-04 19:36:28 +00004248};
4249
4250static PyObject*
4251encoding_map_size(PyObject *obj, PyObject* args)
4252{
4253 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004254 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Martin v. Löwis3f767792006-06-04 19:36:28 +00004255 128*map->count3);
4256}
4257
4258static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004259 {"size", encoding_map_size, METH_NOARGS,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004260 PyDoc_STR("Return the size (in bytes) of this object") },
4261 { 0 }
Martin v. Löwis3f767792006-06-04 19:36:28 +00004262};
4263
4264static void
4265encoding_map_dealloc(PyObject* o)
4266{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004267 PyObject_FREE(o);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004268}
4269
4270static PyTypeObject EncodingMapType = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004271 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004272 "EncodingMap", /*tp_name*/
4273 sizeof(struct encoding_map), /*tp_basicsize*/
4274 0, /*tp_itemsize*/
4275 /* methods */
4276 encoding_map_dealloc, /*tp_dealloc*/
4277 0, /*tp_print*/
4278 0, /*tp_getattr*/
4279 0, /*tp_setattr*/
4280 0, /*tp_compare*/
4281 0, /*tp_repr*/
4282 0, /*tp_as_number*/
4283 0, /*tp_as_sequence*/
4284 0, /*tp_as_mapping*/
4285 0, /*tp_hash*/
4286 0, /*tp_call*/
4287 0, /*tp_str*/
4288 0, /*tp_getattro*/
4289 0, /*tp_setattro*/
4290 0, /*tp_as_buffer*/
4291 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4292 0, /*tp_doc*/
4293 0, /*tp_traverse*/
4294 0, /*tp_clear*/
4295 0, /*tp_richcompare*/
4296 0, /*tp_weaklistoffset*/
4297 0, /*tp_iter*/
4298 0, /*tp_iternext*/
4299 encoding_map_methods, /*tp_methods*/
4300 0, /*tp_members*/
4301 0, /*tp_getset*/
4302 0, /*tp_base*/
4303 0, /*tp_dict*/
4304 0, /*tp_descr_get*/
4305 0, /*tp_descr_set*/
4306 0, /*tp_dictoffset*/
4307 0, /*tp_init*/
4308 0, /*tp_alloc*/
4309 0, /*tp_new*/
4310 0, /*tp_free*/
4311 0, /*tp_is_gc*/
Martin v. Löwis3f767792006-06-04 19:36:28 +00004312};
4313
4314PyObject*
4315PyUnicode_BuildEncodingMap(PyObject* string)
4316{
4317 Py_UNICODE *decode;
4318 PyObject *result;
4319 struct encoding_map *mresult;
4320 int i;
4321 int need_dict = 0;
4322 unsigned char level1[32];
4323 unsigned char level2[512];
4324 unsigned char *mlevel1, *mlevel2, *mlevel3;
4325 int count2 = 0, count3 = 0;
4326
4327 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4328 PyErr_BadArgument();
4329 return NULL;
4330 }
4331 decode = PyUnicode_AS_UNICODE(string);
4332 memset(level1, 0xFF, sizeof level1);
4333 memset(level2, 0xFF, sizeof level2);
4334
4335 /* If there isn't a one-to-one mapping of NULL to \0,
4336 or if there are non-BMP characters, we need to use
4337 a mapping dictionary. */
4338 if (decode[0] != 0)
4339 need_dict = 1;
4340 for (i = 1; i < 256; i++) {
4341 int l1, l2;
4342 if (decode[i] == 0
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004343#ifdef Py_UNICODE_WIDE
Martin v. Löwis3f767792006-06-04 19:36:28 +00004344 || decode[i] > 0xFFFF
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004345#endif
4346 ) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004347 need_dict = 1;
4348 break;
4349 }
4350 if (decode[i] == 0xFFFE)
4351 /* unmapped character */
4352 continue;
4353 l1 = decode[i] >> 11;
4354 l2 = decode[i] >> 7;
4355 if (level1[l1] == 0xFF)
4356 level1[l1] = count2++;
4357 if (level2[l2] == 0xFF)
Benjamin Peterson857ce152009-01-31 16:29:18 +00004358 level2[l2] = count3++;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004359 }
4360
4361 if (count2 >= 0xFF || count3 >= 0xFF)
4362 need_dict = 1;
4363
4364 if (need_dict) {
4365 PyObject *result = PyDict_New();
4366 PyObject *key, *value;
4367 if (!result)
4368 return NULL;
4369 for (i = 0; i < 256; i++) {
Brett Cannona7f13ee2010-05-04 01:16:51 +00004370 value = NULL;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004371 key = PyInt_FromLong(decode[i]);
4372 value = PyInt_FromLong(i);
4373 if (!key || !value)
4374 goto failed1;
4375 if (PyDict_SetItem(result, key, value) == -1)
4376 goto failed1;
Georg Brandl9f167602006-06-04 21:46:16 +00004377 Py_DECREF(key);
4378 Py_DECREF(value);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004379 }
4380 return result;
4381 failed1:
4382 Py_XDECREF(key);
4383 Py_XDECREF(value);
4384 Py_DECREF(result);
4385 return NULL;
4386 }
4387
4388 /* Create a three-level trie */
4389 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4390 16*count2 + 128*count3 - 1);
4391 if (!result)
4392 return PyErr_NoMemory();
4393 PyObject_Init(result, &EncodingMapType);
4394 mresult = (struct encoding_map*)result;
4395 mresult->count2 = count2;
4396 mresult->count3 = count3;
4397 mlevel1 = mresult->level1;
4398 mlevel2 = mresult->level23;
4399 mlevel3 = mresult->level23 + 16*count2;
4400 memcpy(mlevel1, level1, 32);
4401 memset(mlevel2, 0xFF, 16*count2);
4402 memset(mlevel3, 0, 128*count3);
4403 count3 = 0;
4404 for (i = 1; i < 256; i++) {
4405 int o1, o2, o3, i2, i3;
4406 if (decode[i] == 0xFFFE)
4407 /* unmapped character */
4408 continue;
4409 o1 = decode[i]>>11;
4410 o2 = (decode[i]>>7) & 0xF;
4411 i2 = 16*mlevel1[o1] + o2;
4412 if (mlevel2[i2] == 0xFF)
4413 mlevel2[i2] = count3++;
4414 o3 = decode[i] & 0x7F;
4415 i3 = 128*mlevel2[i2] + o3;
4416 mlevel3[i3] = i;
4417 }
4418 return result;
4419}
4420
4421static int
4422encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4423{
4424 struct encoding_map *map = (struct encoding_map*)mapping;
4425 int l1 = c>>11;
4426 int l2 = (c>>7) & 0xF;
4427 int l3 = c & 0x7F;
4428 int i;
4429
4430#ifdef Py_UNICODE_WIDE
4431 if (c > 0xFFFF) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004432 return -1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004433 }
4434#endif
4435 if (c == 0)
4436 return 0;
4437 /* level 1*/
4438 i = map->level1[l1];
4439 if (i == 0xFF) {
4440 return -1;
4441 }
4442 /* level 2*/
4443 i = map->level23[16*i+l2];
4444 if (i == 0xFF) {
4445 return -1;
4446 }
4447 /* level 3 */
4448 i = map->level23[16*map->count2 + 128*i + l3];
4449 if (i == 0) {
4450 return -1;
4451 }
4452 return i;
4453}
4454
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455/* Lookup the character ch in the mapping. If the character
4456 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00004457 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460 PyObject *w = PyInt_FromLong((long)c);
4461 PyObject *x;
4462
4463 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004464 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 x = PyObject_GetItem(mapping, w);
4466 Py_DECREF(w);
4467 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004468 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4469 /* No mapping found means: mapping is undefined. */
4470 PyErr_Clear();
4471 x = Py_None;
4472 Py_INCREF(x);
4473 return x;
4474 } else
4475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004476 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00004477 else if (x == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004478 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004479 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004480 long value = PyInt_AS_LONG(x);
4481 if (value < 0 || value > 255) {
4482 PyErr_SetString(PyExc_TypeError,
4483 "character mapping must be in range(256)");
4484 Py_DECREF(x);
4485 return NULL;
4486 }
4487 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004489 else if (PyString_Check(x))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004490 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004492 /* wrong return value */
4493 PyErr_SetString(PyExc_TypeError,
4494 "character mapping must return integer, None or str");
4495 Py_DECREF(x);
4496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 }
4498}
4499
Martin v. Löwis3f767792006-06-04 19:36:28 +00004500static int
4501charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4502{
Benjamin Peterson857ce152009-01-31 16:29:18 +00004503 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4504 /* exponentially overallocate to minimize reallocations */
4505 if (requiredsize < 2*outsize)
4506 requiredsize = 2*outsize;
4507 if (_PyString_Resize(outobj, requiredsize)) {
4508 return 0;
4509 }
4510 return 1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004511}
4512
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516/* lookup the character, put the result in the output string and adjust
4517 various state variables. Reallocate the output string if not enough
4518 space is available. Return a new reference to the object that
4519 was put in the output buffer, or Py_None, if the mapping was undefined
4520 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00004521 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522static
Martin v. Löwis3f767792006-06-04 19:36:28 +00004523charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004524 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525{
Martin v. Löwis3f767792006-06-04 19:36:28 +00004526 PyObject *rep;
4527 char *outstart;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004528 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529
Christian Heimese93237d2007-12-19 02:37:44 +00004530 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004531 int res = encoding_map_lookup(c, mapping);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004532 Py_ssize_t requiredsize = *outpos+1;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004533 if (res == -1)
4534 return enc_FAILED;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004535 if (outsize<requiredsize)
4536 if (!charmapencode_resize(outobj, outpos, requiredsize))
4537 return enc_EXCEPTION;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004538 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004539 outstart[(*outpos)++] = (char)res;
4540 return enc_SUCCESS;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004541 }
4542
4543 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544 if (rep==NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004545 return enc_EXCEPTION;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004546 else if (rep==Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004547 Py_DECREF(rep);
4548 return enc_FAILED;
Martin v. Löwis3f767792006-06-04 19:36:28 +00004549 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004550 if (PyInt_Check(rep)) {
4551 Py_ssize_t requiredsize = *outpos+1;
4552 if (outsize<requiredsize)
4553 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4554 Py_DECREF(rep);
4555 return enc_EXCEPTION;
4556 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004557 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004558 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004559 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004560 else {
4561 const char *repchars = PyString_AS_STRING(rep);
4562 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4563 Py_ssize_t requiredsize = *outpos+repsize;
4564 if (outsize<requiredsize)
4565 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4566 Py_DECREF(rep);
4567 return enc_EXCEPTION;
4568 }
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004569 outstart = PyString_AS_STRING(*outobj);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004570 memcpy(outstart + *outpos, repchars, repsize);
4571 *outpos += repsize;
4572 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 }
Georg Brandl9f167602006-06-04 21:46:16 +00004574 Py_DECREF(rep);
Martin v. Löwis3f767792006-06-04 19:36:28 +00004575 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576}
4577
4578/* handle an error in PyUnicode_EncodeCharmap
4579 Return 0 on success, -1 on error */
4580static
4581int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00004582 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00004584 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Martin v. Löwis18e16552006-02-15 17:27:45 +00004585 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586{
4587 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004588 Py_ssize_t repsize;
4589 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590 Py_UNICODE *uni2;
4591 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004592 Py_ssize_t collstartpos = *inpos;
4593 Py_ssize_t collendpos = *inpos+1;
4594 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595 char *encoding = "charmap";
4596 char *reason = "character maps to <undefined>";
Martin v. Löwis3f767792006-06-04 19:36:28 +00004597 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 /* find all unencodable characters */
4600 while (collendpos < size) {
Martin v. Löwis3f767792006-06-04 19:36:28 +00004601 PyObject *rep;
Christian Heimese93237d2007-12-19 02:37:44 +00004602 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004603 int res = encoding_map_lookup(p[collendpos], mapping);
4604 if (res != -1)
4605 break;
4606 ++collendpos;
4607 continue;
4608 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004609
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004610 rep = charmapencode_lookup(p[collendpos], mapping);
4611 if (rep==NULL)
4612 return -1;
4613 else if (rep!=Py_None) {
4614 Py_DECREF(rep);
4615 break;
4616 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004617 Py_DECREF(rep);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004618 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 }
4620 /* cache callback name lookup
4621 * (if not done yet, i.e. it's the first error) */
4622 if (*known_errorHandler==-1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004623 if ((errors==NULL) || (!strcmp(errors, "strict")))
4624 *known_errorHandler = 1;
4625 else if (!strcmp(errors, "replace"))
4626 *known_errorHandler = 2;
4627 else if (!strcmp(errors, "ignore"))
4628 *known_errorHandler = 3;
4629 else if (!strcmp(errors, "xmlcharrefreplace"))
4630 *known_errorHandler = 4;
4631 else
4632 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 }
4634 switch (*known_errorHandler) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004635 case 1: /* strict */
4636 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4637 return -1;
4638 case 2: /* replace */
4639 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004640 x = charmapencode_output('?', mapping, res, respos);
4641 if (x==enc_EXCEPTION) {
4642 return -1;
4643 }
4644 else if (x==enc_FAILED) {
4645 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4646 return -1;
4647 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004648 }
4649 /* fall through */
4650 case 3: /* ignore */
4651 *inpos = collendpos;
4652 break;
4653 case 4: /* xmlcharrefreplace */
4654 /* generate replacement (temporarily (mis)uses p) */
4655 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004656 char buffer[2+29+1+1];
4657 char *cp;
4658 sprintf(buffer, "&#%d;", (int)p[collpos]);
4659 for (cp = buffer; *cp; ++cp) {
4660 x = charmapencode_output(*cp, mapping, res, respos);
4661 if (x==enc_EXCEPTION)
4662 return -1;
4663 else if (x==enc_FAILED) {
4664 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4665 return -1;
4666 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004667 }
4668 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004669 *inpos = collendpos;
4670 break;
4671 default:
4672 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004673 encoding, reason, p, size, exceptionObject,
4674 collstartpos, collendpos, &newpos);
Benjamin Peterson857ce152009-01-31 16:29:18 +00004675 if (repunicode == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004676 return -1;
Benjamin Peterson857ce152009-01-31 16:29:18 +00004677 /* generate replacement */
4678 repsize = PyUnicode_GET_SIZE(repunicode);
4679 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004680 x = charmapencode_output(*uni2, mapping, res, respos);
4681 if (x==enc_EXCEPTION) {
4682 return -1;
4683 }
4684 else if (x==enc_FAILED) {
4685 Py_DECREF(repunicode);
4686 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4687 return -1;
4688 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004689 }
4690 *inpos = newpos;
4691 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004692 }
4693 return 0;
4694}
4695
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004697 Py_ssize_t size,
4698 PyObject *mapping,
4699 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004701 /* output object */
4702 PyObject *res = NULL;
4703 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004704 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004705 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004706 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004707 PyObject *errorHandler = NULL;
4708 PyObject *exc = NULL;
4709 /* the following variable is used for caching string comparisons
4710 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4711 * 3=ignore, 4=xmlcharrefreplace */
4712 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713
4714 /* Default to Latin-1 */
4715 if (mapping == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004716 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 /* allocate enough for a simple encoding without
4719 replacements, if we need more, we'll resize */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004720 res = PyString_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004721 if (res == NULL)
4722 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004723 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004724 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004726 while (inpos<size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004727 /* try to encode it */
4728 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4729 if (x==enc_EXCEPTION) /* error */
4730 goto onError;
4731 if (x==enc_FAILED) { /* unencodable character */
4732 if (charmap_encoding_error(p, size, &inpos, mapping,
4733 &exc,
4734 &known_errorHandler, &errorHandler, errors,
4735 &res, &respos)) {
4736 goto onError;
4737 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00004738 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004739 else
4740 /* done with this character => adjust input position */
4741 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004744 /* Resize if we allocated to much */
Gregory P. Smithdd96db62008-06-09 04:58:54 +00004745 if (respos<PyString_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004746 if (_PyString_Resize(&res, respos))
4747 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748 }
4749 Py_XDECREF(exc);
4750 Py_XDECREF(errorHandler);
4751 return res;
4752
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004753 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 Py_XDECREF(res);
4755 Py_XDECREF(exc);
4756 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 return NULL;
4758}
4759
4760PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004761 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762{
4763 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004764 PyErr_BadArgument();
4765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 }
4767 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004768 PyUnicode_GET_SIZE(unicode),
4769 mapping,
4770 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004771}
4772
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004773/* create or adjust a UnicodeTranslateError */
4774static void make_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004775 const Py_UNICODE *unicode, Py_ssize_t size,
4776 Py_ssize_t startpos, Py_ssize_t endpos,
4777 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004779 if (*exceptionObject == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00004780 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004781 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782 }
4783 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004784 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4785 goto onError;
4786 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4787 goto onError;
4788 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4789 goto onError;
4790 return;
4791 onError:
4792 Py_DECREF(*exceptionObject);
4793 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 }
4795}
4796
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004797/* raises a UnicodeTranslateError */
4798static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004799 const Py_UNICODE *unicode, Py_ssize_t size,
4800 Py_ssize_t startpos, Py_ssize_t endpos,
4801 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802{
4803 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004804 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004805 if (*exceptionObject != NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004806 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807}
4808
4809/* error handling callback helper:
4810 build arguments, call the callback and check the arguments,
4811 put the result into newpos and return the replacement string, which
4812 has to be freed by the caller */
4813static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004814 PyObject **errorHandler,
4815 const char *reason,
4816 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4817 Py_ssize_t startpos, Py_ssize_t endpos,
4818 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004819{
Martin v. Löwis412fb672006-04-13 06:34:32 +00004820 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821
Martin v. Löwis412fb672006-04-13 06:34:32 +00004822 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 PyObject *restuple;
4824 PyObject *resunicode;
4825
4826 if (*errorHandler == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004827 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 if (*errorHandler == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004829 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 }
4831
4832 make_translate_exception(exceptionObject,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004833 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 if (*exceptionObject == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004835 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004836
4837 restuple = PyObject_CallFunctionObjArgs(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004838 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 if (restuple == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004840 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841 if (!PyTuple_Check(restuple)) {
Georg Brandlcbb49582009-02-13 11:06:59 +00004842 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004843 Py_DECREF(restuple);
4844 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845 }
4846 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004847 &resunicode, &i_newpos)) {
4848 Py_DECREF(restuple);
4849 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00004851 if (i_newpos<0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004852 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004853 else
4854 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004855 if (*newpos<0 || *newpos>size) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004856 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4857 Py_DECREF(restuple);
4858 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004859 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 Py_INCREF(resunicode);
4861 Py_DECREF(restuple);
4862 return resunicode;
4863}
4864
4865/* Lookup the character ch in the mapping and put the result in result,
4866 which must be decrefed by the caller.
4867 Return 0 on success, -1 on error */
4868static
4869int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4870{
4871 PyObject *w = PyInt_FromLong((long)c);
4872 PyObject *x;
4873
4874 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004875 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 x = PyObject_GetItem(mapping, w);
4877 Py_DECREF(w);
4878 if (x == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004879 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4880 /* No mapping found means: use 1:1 mapping. */
4881 PyErr_Clear();
4882 *result = NULL;
4883 return 0;
4884 } else
4885 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004886 }
4887 else if (x == Py_None) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004888 *result = x;
4889 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004890 }
4891 else if (PyInt_Check(x)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004892 long value = PyInt_AS_LONG(x);
4893 long max = PyUnicode_GetMax();
4894 if (value < 0 || value > max) {
4895 PyErr_Format(PyExc_TypeError,
4896 "character mapping must be in range(0x%lx)", max+1);
4897 Py_DECREF(x);
4898 return -1;
4899 }
4900 *result = x;
4901 return 0;
4902 }
4903 else if (PyUnicode_Check(x)) {
4904 *result = x;
4905 return 0;
4906 }
4907 else {
4908 /* wrong return value */
4909 PyErr_SetString(PyExc_TypeError,
4910 "character mapping must return integer, None or unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00004911 Py_DECREF(x);
4912 return -1;
4913 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914}
4915/* ensure that *outobj is at least requiredsize characters long,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004916 if not reallocate and adjust various state variables.
4917 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004918static
Walter Dörwald4894c302003-10-24 14:25:28 +00004919int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004920 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004921{
Martin v. Löwis18e16552006-02-15 17:27:45 +00004922 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00004923 if (requiredsize > oldsize) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004924 /* remember old output position */
4925 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4926 /* exponentially overallocate to minimize reallocations */
4927 if (requiredsize < 2 * oldsize)
4928 requiredsize = 2 * oldsize;
4929 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4930 return -1;
4931 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004932 }
4933 return 0;
4934}
4935/* lookup the character, put the result in the output string and adjust
4936 various state variables. Return a new reference to the object that
4937 was put in the output buffer in *result, or Py_None, if the mapping was
4938 undefined (in which case no character was written).
4939 The called must decref result.
4940 Return 0 on success, -1 on error. */
4941static
Walter Dörwald4894c302003-10-24 14:25:28 +00004942int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004943 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4944 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004945{
Walter Dörwald4894c302003-10-24 14:25:28 +00004946 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004947 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004948 if (*res==NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004949 /* not found => default to 1:1 mapping */
4950 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004951 }
4952 else if (*res==Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004953 ;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004954 else if (PyInt_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004955 /* no overflow check, because we know that the space is enough */
4956 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004957 }
4958 else if (PyUnicode_Check(*res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004959 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4960 if (repsize==1) {
4961 /* no overflow check, because we know that the space is enough */
4962 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4963 }
4964 else if (repsize!=0) {
4965 /* more than one character */
4966 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4967 (insize - (curinp-startinp)) +
4968 repsize - 1;
4969 if (charmaptranslate_makespace(outobj, outp, requiredsize))
4970 return -1;
4971 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4972 *outp += repsize;
4973 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004974 }
4975 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004976 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004977 return 0;
4978}
4979
4980PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00004981 Py_ssize_t size,
4982 PyObject *mapping,
4983 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004985 /* output object */
4986 PyObject *res = NULL;
4987 /* pointers to the beginning and end+1 of input */
4988 const Py_UNICODE *startp = p;
4989 const Py_UNICODE *endp = p + size;
4990 /* pointer into the output */
4991 Py_UNICODE *str;
4992 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004993 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 char *reason = "character maps to <undefined>";
4995 PyObject *errorHandler = NULL;
4996 PyObject *exc = NULL;
4997 /* the following variable is used for caching string comparisons
4998 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4999 * 3=ignore, 4=xmlcharrefreplace */
5000 int known_errorHandler = -1;
5001
Guido van Rossumd57fd912000-03-10 22:53:23 +00005002 if (mapping == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005003 PyErr_BadArgument();
5004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005005 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005006
5007 /* allocate enough for a simple 1:1 translation without
5008 replacements, if we need more, we'll resize */
5009 res = PyUnicode_FromUnicode(NULL, size);
5010 if (res == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005011 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 if (size == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005013 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005014 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005016 while (p<endp) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005017 /* try to encode it */
5018 PyObject *x = NULL;
5019 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5020 Py_XDECREF(x);
5021 goto onError;
5022 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005023 Py_XDECREF(x);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005024 if (x!=Py_None) /* it worked => adjust input pointer */
5025 ++p;
5026 else { /* untranslatable character */
5027 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5028 Py_ssize_t repsize;
5029 Py_ssize_t newpos;
5030 Py_UNICODE *uni2;
5031 /* startpos for collecting untranslatable chars */
5032 const Py_UNICODE *collstart = p;
5033 const Py_UNICODE *collend = p+1;
5034 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005036 /* find all untranslatable characters */
5037 while (collend < endp) {
5038 if (charmaptranslate_lookup(*collend, mapping, &x))
5039 goto onError;
5040 Py_XDECREF(x);
5041 if (x!=Py_None)
5042 break;
5043 ++collend;
5044 }
5045 /* cache callback name lookup
5046 * (if not done yet, i.e. it's the first error) */
5047 if (known_errorHandler==-1) {
5048 if ((errors==NULL) || (!strcmp(errors, "strict")))
5049 known_errorHandler = 1;
5050 else if (!strcmp(errors, "replace"))
5051 known_errorHandler = 2;
5052 else if (!strcmp(errors, "ignore"))
5053 known_errorHandler = 3;
5054 else if (!strcmp(errors, "xmlcharrefreplace"))
5055 known_errorHandler = 4;
5056 else
5057 known_errorHandler = 0;
5058 }
5059 switch (known_errorHandler) {
5060 case 1: /* strict */
5061 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005062 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005063 case 2: /* replace */
5064 /* No need to check for space, this is a 1:1 replacement */
5065 for (coll = collstart; coll<collend; ++coll)
5066 *str++ = '?';
5067 /* fall through */
5068 case 3: /* ignore */
5069 p = collend;
5070 break;
5071 case 4: /* xmlcharrefreplace */
5072 /* generate replacement (temporarily (mis)uses p) */
5073 for (p = collstart; p < collend; ++p) {
5074 char buffer[2+29+1+1];
5075 char *cp;
5076 sprintf(buffer, "&#%d;", (int)*p);
5077 if (charmaptranslate_makespace(&res, &str,
5078 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5079 goto onError;
5080 for (cp = buffer; *cp; ++cp)
5081 *str++ = *cp;
5082 }
5083 p = collend;
5084 break;
5085 default:
5086 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5087 reason, startp, size, &exc,
5088 collstart-startp, collend-startp, &newpos);
5089 if (repunicode == NULL)
5090 goto onError;
5091 /* generate replacement */
5092 repsize = PyUnicode_GET_SIZE(repunicode);
5093 if (charmaptranslate_makespace(&res, &str,
5094 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5095 Py_DECREF(repunicode);
5096 goto onError;
5097 }
5098 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5099 *str++ = *uni2;
5100 p = startp + newpos;
5101 Py_DECREF(repunicode);
Benjamin Peterson857ce152009-01-31 16:29:18 +00005102 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005103 }
5104 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005105 /* Resize if we allocated to much */
5106 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00005107 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005108 if (PyUnicode_Resize(&res, respos) < 0)
5109 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005110 }
5111 Py_XDECREF(exc);
5112 Py_XDECREF(errorHandler);
5113 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005115 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116 Py_XDECREF(res);
5117 Py_XDECREF(exc);
5118 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119 return NULL;
5120}
5121
5122PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005123 PyObject *mapping,
5124 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125{
5126 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005127
Guido van Rossumd57fd912000-03-10 22:53:23 +00005128 str = PyUnicode_FromObject(str);
5129 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005130 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005132 PyUnicode_GET_SIZE(str),
5133 mapping,
5134 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 Py_DECREF(str);
5136 return result;
Tim Petersced69f82003-09-16 20:30:58 +00005137
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005138 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 Py_XDECREF(str);
5140 return NULL;
5141}
Tim Petersced69f82003-09-16 20:30:58 +00005142
Guido van Rossum9e896b32000-04-05 20:11:21 +00005143/* --- Decimal Encoder ---------------------------------------------------- */
5144
5145int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005146 Py_ssize_t length,
5147 char *output,
5148 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00005149{
5150 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005151 PyObject *errorHandler = NULL;
5152 PyObject *exc = NULL;
5153 const char *encoding = "decimal";
5154 const char *reason = "invalid decimal Unicode string";
5155 /* the following variable is used for caching string comparisons
5156 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5157 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005158
5159 if (output == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005160 PyErr_BadArgument();
5161 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00005162 }
5163
5164 p = s;
5165 end = s + length;
5166 while (p < end) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005167 register Py_UNICODE ch = *p;
5168 int decimal;
5169 PyObject *repunicode;
5170 Py_ssize_t repsize;
5171 Py_ssize_t newpos;
5172 Py_UNICODE *uni2;
5173 Py_UNICODE *collstart;
5174 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00005175
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005176 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005177 *output++ = ' ';
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005178 ++p;
5179 continue;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005180 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005181 decimal = Py_UNICODE_TODECIMAL(ch);
5182 if (decimal >= 0) {
5183 *output++ = '0' + decimal;
5184 ++p;
5185 continue;
5186 }
5187 if (0 < ch && ch < 256) {
5188 *output++ = (char)ch;
5189 ++p;
5190 continue;
5191 }
5192 /* All other characters are considered unencodable */
5193 collstart = p;
Victor Stinner975134e2011-11-22 01:54:19 +01005194 for (collend = p+1; collend < end; collend++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005195 if ((0 < *collend && *collend < 256) ||
Victor Stinner975134e2011-11-22 01:54:19 +01005196 Py_UNICODE_ISSPACE(*collend) ||
5197 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005198 break;
5199 }
5200 /* cache callback name lookup
5201 * (if not done yet, i.e. it's the first error) */
5202 if (known_errorHandler==-1) {
5203 if ((errors==NULL) || (!strcmp(errors, "strict")))
5204 known_errorHandler = 1;
5205 else if (!strcmp(errors, "replace"))
5206 known_errorHandler = 2;
5207 else if (!strcmp(errors, "ignore"))
5208 known_errorHandler = 3;
5209 else if (!strcmp(errors, "xmlcharrefreplace"))
5210 known_errorHandler = 4;
5211 else
5212 known_errorHandler = 0;
5213 }
5214 switch (known_errorHandler) {
5215 case 1: /* strict */
5216 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5217 goto onError;
5218 case 2: /* replace */
5219 for (p = collstart; p < collend; ++p)
5220 *output++ = '?';
5221 /* fall through */
5222 case 3: /* ignore */
5223 p = collend;
5224 break;
5225 case 4: /* xmlcharrefreplace */
5226 /* generate replacement (temporarily (mis)uses p) */
5227 for (p = collstart; p < collend; ++p)
5228 output += sprintf(output, "&#%d;", (int)*p);
5229 p = collend;
5230 break;
5231 default:
5232 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5233 encoding, reason, s, length, &exc,
5234 collstart-s, collend-s, &newpos);
5235 if (repunicode == NULL)
5236 goto onError;
5237 /* generate replacement */
5238 repsize = PyUnicode_GET_SIZE(repunicode);
5239 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5240 Py_UNICODE ch = *uni2;
5241 if (Py_UNICODE_ISSPACE(ch))
5242 *output++ = ' ';
5243 else {
5244 decimal = Py_UNICODE_TODECIMAL(ch);
5245 if (decimal >= 0)
5246 *output++ = '0' + decimal;
5247 else if (0 < ch && ch < 256)
5248 *output++ = (char)ch;
5249 else {
5250 Py_DECREF(repunicode);
5251 raise_encode_exception(&exc, encoding,
5252 s, length, collstart-s, collend-s, reason);
5253 goto onError;
5254 }
5255 }
5256 }
5257 p = s + newpos;
5258 Py_DECREF(repunicode);
5259 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00005260 }
5261 /* 0-terminate the output string */
5262 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005263 Py_XDECREF(exc);
5264 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005265 return 0;
5266
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005267 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005268 Py_XDECREF(exc);
5269 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00005270 return -1;
5271}
5272
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273/* --- Helpers ------------------------------------------------------------ */
5274
Eric Smitha9f7d622008-02-17 19:46:49 +00005275#include "stringlib/unicodedefs.h"
Fredrik Lundha50d2012006-05-26 17:04:58 +00005276#include "stringlib/fastsearch.h"
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005277
5278#include "stringlib/count.h"
5279#include "stringlib/find.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005280#include "stringlib/partition.h"
Antoine Pitrou64672132010-01-13 07:55:48 +00005281#include "stringlib/split.h"
Fredrik Lundhb9479482006-05-26 17:22:38 +00005282
/* Helper macro to fix up start/end slice values against a length.
 *
 *   - end greater than len is clamped to len;
 *   - a negative end has len added once and is then clamped to 0;
 *   - a negative start has len added once and is then clamped to 0.
 *
 * start and end must be modifiable lvalues; they are updated in place.
 * start is NOT clamped to len, so start may end up greater than end.
 *
 * The expansion is wrapped in do { } while (0) so that
 * "ADJUST_INDICES(a, b, n);" is a single statement (safe in an unbraced
 * if/else), and every argument is parenthesized so that compound
 * expressions such as "c ? x : y" may be passed as len.  Arguments are
 * still evaluated more than once, so they must be free of side effects.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Fredrik Lundhc8162812006-05-26 19:33:03 +00005297
Martin v. Löwis18e16552006-02-15 17:27:45 +00005298Py_ssize_t PyUnicode_Count(PyObject *str,
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005299 PyObject *substr,
5300 Py_ssize_t start,
5301 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005303 Py_ssize_t result;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005304 PyUnicodeObject* str_obj;
5305 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00005306
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005307 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5308 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005309 return -1;
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005310 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5311 if (!sub_obj) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005312 Py_DECREF(str_obj);
5313 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314 }
Tim Petersced69f82003-09-16 20:30:58 +00005315
Antoine Pitrou64672132010-01-13 07:55:48 +00005316 ADJUST_INDICES(start, end, str_obj->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005317 result = stringlib_count(
Antoine Pitrou64672132010-01-13 07:55:48 +00005318 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5319 PY_SSIZE_T_MAX
Fredrik Lundh58b5e842006-05-26 19:24:53 +00005320 );
5321
5322 Py_DECREF(sub_obj);
5323 Py_DECREF(str_obj);
5324
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 return result;
5326}
5327
Martin v. Löwis18e16552006-02-15 17:27:45 +00005328Py_ssize_t PyUnicode_Find(PyObject *str,
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005329 PyObject *sub,
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005330 Py_ssize_t start,
5331 Py_ssize_t end,
5332 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005334 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005335
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005336 str = PyUnicode_FromObject(str);
5337 if (!str)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005338 return -2;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005339 sub = PyUnicode_FromObject(sub);
5340 if (!sub) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005341 Py_DECREF(str);
5342 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343 }
Tim Petersced69f82003-09-16 20:30:58 +00005344
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005345 if (direction > 0)
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005346 result = stringlib_find_slice(
5347 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5348 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5349 start, end
5350 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005351 else
Fredrik Lundh60d8b182006-05-27 15:20:22 +00005352 result = stringlib_rfind_slice(
5353 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5354 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5355 start, end
5356 );
Fredrik Lundhce4eccb2006-05-26 19:29:05 +00005357
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00005358 Py_DECREF(str);
5359 Py_DECREF(sub);
5360
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 return result;
5362}
5363
Tim Petersced69f82003-09-16 20:30:58 +00005364static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365int tailmatch(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005366 PyUnicodeObject *substring,
5367 Py_ssize_t start,
5368 Py_ssize_t end,
5369 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 if (substring->length == 0)
5372 return 1;
5373
Antoine Pitrou64672132010-01-13 07:55:48 +00005374 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375 end -= substring->length;
5376 if (end < start)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005377 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378
5379 if (direction > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005380 if (Py_UNICODE_MATCH(self, end, substring))
5381 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 } else {
5383 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005384 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 }
5386
5387 return 0;
5388}
5389
Martin v. Löwis18e16552006-02-15 17:27:45 +00005390Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005391 PyObject *substr,
5392 Py_ssize_t start,
5393 Py_ssize_t end,
5394 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005396 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00005397
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 str = PyUnicode_FromObject(str);
5399 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005400 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 substr = PyUnicode_FromObject(substr);
5402 if (substr == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005403 Py_DECREF(str);
5404 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 }
Tim Petersced69f82003-09-16 20:30:58 +00005406
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 result = tailmatch((PyUnicodeObject *)str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005408 (PyUnicodeObject *)substr,
5409 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 Py_DECREF(str);
5411 Py_DECREF(substr);
5412 return result;
5413}
5414
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415/* Apply fixfct filter to the Unicode object self and return a
5416 reference to the modified object */
5417
Tim Petersced69f82003-09-16 20:30:58 +00005418static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419PyObject *fixup(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005420 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421{
5422
5423 PyUnicodeObject *u;
5424
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005425 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005427 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005428
5429 Py_UNICODE_COPY(u->str, self->str, self->length);
5430
Tim Peters7a29bd52001-09-12 03:03:31 +00005431 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005432 /* fixfct should return TRUE if it modified the buffer. If
5433 FALSE, return a reference to the original buffer instead
5434 (to save space, not time) */
5435 Py_INCREF(self);
5436 Py_DECREF(u);
5437 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 }
5439 return (PyObject*) u;
5440}
5441
Tim Petersced69f82003-09-16 20:30:58 +00005442static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443int fixupper(PyUnicodeObject *self)
5444{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005445 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 Py_UNICODE *s = self->str;
5447 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005448
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005450 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005451
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005452 ch = Py_UNICODE_TOUPPER(*s);
5453 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005455 *s = ch;
5456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 s++;
5458 }
5459
5460 return status;
5461}
5462
Tim Petersced69f82003-09-16 20:30:58 +00005463static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464int fixlower(PyUnicodeObject *self)
5465{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005466 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 Py_UNICODE *s = self->str;
5468 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005469
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 while (len-- > 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005471 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00005472
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005473 ch = Py_UNICODE_TOLOWER(*s);
5474 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 status = 1;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005476 *s = ch;
5477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 s++;
5479 }
5480
5481 return status;
5482}
5483
Tim Petersced69f82003-09-16 20:30:58 +00005484static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485int fixswapcase(PyUnicodeObject *self)
5486{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005487 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 Py_UNICODE *s = self->str;
5489 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005490
Guido van Rossumd57fd912000-03-10 22:53:23 +00005491 while (len-- > 0) {
5492 if (Py_UNICODE_ISUPPER(*s)) {
5493 *s = Py_UNICODE_TOLOWER(*s);
5494 status = 1;
5495 } else if (Py_UNICODE_ISLOWER(*s)) {
5496 *s = Py_UNICODE_TOUPPER(*s);
5497 status = 1;
5498 }
5499 s++;
5500 }
5501
5502 return status;
5503}
5504
Tim Petersced69f82003-09-16 20:30:58 +00005505static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506int fixcapitalize(PyUnicodeObject *self)
5507{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005508 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005509 Py_UNICODE *s = self->str;
5510 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005511
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005512 if (len == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005513 return 0;
Ezio Melotti15d6b652011-08-15 09:22:24 +03005514 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005515 *s = Py_UNICODE_TOUPPER(*s);
5516 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005518 s++;
5519 while (--len > 0) {
Ezio Melotti15d6b652011-08-15 09:22:24 +03005520 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00005521 *s = Py_UNICODE_TOLOWER(*s);
5522 status = 1;
5523 }
5524 s++;
5525 }
5526 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527}
5528
5529static
5530int fixtitle(PyUnicodeObject *self)
5531{
5532 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5533 register Py_UNICODE *e;
5534 int previous_is_cased;
5535
5536 /* Shortcut for single character strings */
5537 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005538 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5539 if (*p != ch) {
5540 *p = ch;
5541 return 1;
5542 }
5543 else
5544 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 }
Tim Petersced69f82003-09-16 20:30:58 +00005546
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 e = p + PyUnicode_GET_SIZE(self);
5548 previous_is_cased = 0;
5549 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005550 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005551
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005552 if (previous_is_cased)
5553 *p = Py_UNICODE_TOLOWER(ch);
5554 else
5555 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00005556
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005557 if (Py_UNICODE_ISLOWER(ch) ||
5558 Py_UNICODE_ISUPPER(ch) ||
5559 Py_UNICODE_ISTITLE(ch))
5560 previous_is_cased = 1;
5561 else
5562 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 }
5564 return 1;
5565}
5566
Tim Peters8ce9f162004-08-27 01:49:32 +00005567PyObject *
5568PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569{
Tim Peters8ce9f162004-08-27 01:49:32 +00005570 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00005571 const Py_UNICODE blank = ' ';
5572 const Py_UNICODE *sep = &blank;
Tim Peters286085c2006-05-22 19:17:04 +00005573 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005574 PyUnicodeObject *res = NULL; /* the result */
Tim Peters286085c2006-05-22 19:17:04 +00005575 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5576 Py_ssize_t res_used; /* # used bytes */
Tim Peters05eba1f2004-08-27 21:32:02 +00005577 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5578 PyObject *fseq; /* PySequence_Fast(seq) */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005579 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00005580 PyObject *item;
Martin v. Löwis412fb672006-04-13 06:34:32 +00005581 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
Tim Peters05eba1f2004-08-27 21:32:02 +00005583 fseq = PySequence_Fast(seq, "");
5584 if (fseq == NULL) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005585 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00005586 }
5587
Tim Peters91879ab2004-08-27 22:35:44 +00005588 /* Grrrr. A codec may be invoked to convert str objects to
5589 * Unicode, and so it's possible to call back into Python code
5590 * during PyUnicode_FromObject(), and so it's possible for a sick
5591 * codec to change the size of fseq (if seq is a list). Therefore
5592 * we have to keep refetching the size -- can't assume seqlen
5593 * is invariant.
5594 */
Tim Peters05eba1f2004-08-27 21:32:02 +00005595 seqlen = PySequence_Fast_GET_SIZE(fseq);
5596 /* If empty sequence, return u"". */
5597 if (seqlen == 0) {
Benjamin Peterson857ce152009-01-31 16:29:18 +00005598 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5599 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00005600 }
5601 /* If singleton sequence with an exact Unicode, return that. */
5602 if (seqlen == 1) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005603 item = PySequence_Fast_GET_ITEM(fseq, 0);
5604 if (PyUnicode_CheckExact(item)) {
5605 Py_INCREF(item);
5606 res = (PyUnicodeObject *)item;
5607 goto Done;
5608 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005609 }
5610
Tim Peters05eba1f2004-08-27 21:32:02 +00005611 /* At least two items to join, or one that isn't exact Unicode. */
5612 if (seqlen > 1) {
5613 /* Set up sep and seplen -- they're needed. */
Benjamin Peterson857ce152009-01-31 16:29:18 +00005614 if (separator == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005615 sep = &blank;
5616 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00005617 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00005618 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005619 internal_separator = PyUnicode_FromObject(separator);
5620 if (internal_separator == NULL)
5621 goto onError;
5622 sep = PyUnicode_AS_UNICODE(internal_separator);
5623 seplen = PyUnicode_GET_SIZE(internal_separator);
5624 /* In case PyUnicode_FromObject() mutated seq. */
5625 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00005626 }
5627 }
5628
5629 /* Get space. */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005630 res = _PyUnicode_New(res_alloc);
Tim Peters05eba1f2004-08-27 21:32:02 +00005631 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00005632 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00005633 res_p = PyUnicode_AS_UNICODE(res);
5634 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00005635
Tim Peters05eba1f2004-08-27 21:32:02 +00005636 for (i = 0; i < seqlen; ++i) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005637 Py_ssize_t itemlen;
5638 Py_ssize_t new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005639
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005640 item = PySequence_Fast_GET_ITEM(fseq, i);
5641 /* Convert item to Unicode. */
5642 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5643 PyErr_Format(PyExc_TypeError,
5644 "sequence item %zd: expected string or Unicode,"
5645 " %.80s found",
5646 i, Py_TYPE(item)->tp_name);
5647 goto onError;
5648 }
5649 item = PyUnicode_FromObject(item);
5650 if (item == NULL)
5651 goto onError;
5652 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00005653
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005654 /* In case PyUnicode_FromObject() mutated seq. */
5655 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters91879ab2004-08-27 22:35:44 +00005656
Tim Peters8ce9f162004-08-27 01:49:32 +00005657 /* Make sure we have enough space for the separator and the item. */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005658 itemlen = PyUnicode_GET_SIZE(item);
5659 new_res_used = res_used + itemlen;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005660 if (new_res_used < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005661 goto Overflow;
5662 if (i < seqlen - 1) {
5663 new_res_used += seplen;
5664 if (new_res_used < 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00005665 goto Overflow;
Benjamin Peterson857ce152009-01-31 16:29:18 +00005666 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005667 if (new_res_used > res_alloc) {
5668 /* double allocated size until it's big enough */
5669 do {
5670 res_alloc += res_alloc;
5671 if (res_alloc <= 0)
5672 goto Overflow;
5673 } while (new_res_used > res_alloc);
5674 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5675 Py_DECREF(item);
5676 goto onError;
5677 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005678 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005679 }
Tim Peters05eba1f2004-08-27 21:32:02 +00005680
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005681 /* Copy item, and maybe the separator. */
5682 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5683 res_p += itemlen;
5684 if (i < seqlen - 1) {
5685 Py_UNICODE_COPY(res_p, sep, seplen);
5686 res_p += seplen;
5687 }
5688 Py_DECREF(item);
5689 res_used = new_res_used;
Tim Peters05eba1f2004-08-27 21:32:02 +00005690 }
Tim Peters8ce9f162004-08-27 01:49:32 +00005691
Tim Peters05eba1f2004-08-27 21:32:02 +00005692 /* Shrink res to match the used area; this probably can't fail,
5693 * but it's cheap to check.
5694 */
Martin v. Löwis412fb672006-04-13 06:34:32 +00005695 if (_PyUnicode_Resize(&res, res_used) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005696 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00005697
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005698 Done:
Tim Peters8ce9f162004-08-27 01:49:32 +00005699 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005700 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 return (PyObject *)res;
5702
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005703 Overflow:
Tim Peters8ce9f162004-08-27 01:49:32 +00005704 PyErr_SetString(PyExc_OverflowError,
Georg Brandl90e27d32006-06-10 06:40:50 +00005705 "join() result is too long for a Python string");
Tim Peters8ce9f162004-08-27 01:49:32 +00005706 Py_DECREF(item);
5707 /* fall through */
5708
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005709 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00005710 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00005711 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00005712 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 return NULL;
5714}
5715
Tim Petersced69f82003-09-16 20:30:58 +00005716static
5717PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005718 Py_ssize_t left,
5719 Py_ssize_t right,
5720 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721{
5722 PyUnicodeObject *u;
5723
5724 if (left < 0)
5725 left = 0;
5726 if (right < 0)
5727 right = 0;
5728
Tim Peters7a29bd52001-09-12 03:03:31 +00005729 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 Py_INCREF(self);
5731 return self;
5732 }
5733
Neal Norwitze7d8be82008-07-31 17:17:14 +00005734 if (left > PY_SSIZE_T_MAX - self->length ||
5735 right > PY_SSIZE_T_MAX - (left + self->length)) {
5736 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5737 return NULL;
5738 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 u = _PyUnicode_New(left + self->length + right);
5740 if (u) {
5741 if (left)
5742 Py_UNICODE_FILL(u->str, fill, left);
5743 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5744 if (right)
5745 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5746 }
5747
5748 return u;
5749}
5750
Antoine Pitrou64672132010-01-13 07:55:48 +00005751PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
5755 string = PyUnicode_FromObject(string);
5756 if (string == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Antoine Pitrou64672132010-01-13 07:55:48 +00005759 list = stringlib_splitlines(
5760 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5761 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
5763 Py_DECREF(string);
5764 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765}
5766
Tim Petersced69f82003-09-16 20:30:58 +00005767static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768PyObject *split(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005769 PyUnicodeObject *substring,
5770 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005773 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005776 return stringlib_split_whitespace(
5777 (PyObject*) self, self->str, self->length, maxcount
5778 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779
Antoine Pitrou64672132010-01-13 07:55:48 +00005780 return stringlib_split(
5781 (PyObject*) self, self->str, self->length,
5782 substring->str, substring->length,
5783 maxcount
5784 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785}
5786
Tim Petersced69f82003-09-16 20:30:58 +00005787static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005788PyObject *rsplit(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005789 PyUnicodeObject *substring,
5790 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005791{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005792 if (maxcount < 0)
Martin v. Löwis412fb672006-04-13 06:34:32 +00005793 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005794
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005795 if (substring == NULL)
Antoine Pitrou64672132010-01-13 07:55:48 +00005796 return stringlib_rsplit_whitespace(
5797 (PyObject*) self, self->str, self->length, maxcount
5798 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005799
Antoine Pitrou64672132010-01-13 07:55:48 +00005800 return stringlib_rsplit(
5801 (PyObject*) self, self->str, self->length,
5802 substring->str, substring->length,
5803 maxcount
5804 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005805}
5806
5807static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808PyObject *replace(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005809 PyUnicodeObject *str1,
5810 PyUnicodeObject *str2,
5811 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812{
5813 PyUnicodeObject *u;
5814
5815 if (maxcount < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005816 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrou64672132010-01-13 07:55:48 +00005817 else if (maxcount == 0 || self->length == 0)
5818 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819
Fredrik Lundh347ee272006-05-24 16:35:18 +00005820 if (str1->length == str2->length) {
Antoine Pitrou5c767c22010-01-13 08:55:20 +00005821 Py_ssize_t i;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005822 /* same length */
Antoine Pitrou64672132010-01-13 07:55:48 +00005823 if (str1->length == 0)
5824 goto nothing;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005825 if (str1->length == 1) {
5826 /* replace characters */
5827 Py_UNICODE u1, u2;
5828 if (!findchar(self->str, self->length, str1->str[0]))
5829 goto nothing;
5830 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5831 if (!u)
5832 return NULL;
5833 Py_UNICODE_COPY(u->str, self->str, self->length);
5834 u1 = str1->str[0];
5835 u2 = str2->str[0];
5836 for (i = 0; i < u->length; i++)
5837 if (u->str[i] == u1) {
5838 if (--maxcount < 0)
5839 break;
5840 u->str[i] = u2;
5841 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 } else {
Antoine Pitrou64672132010-01-13 07:55:48 +00005843 i = stringlib_find(
5844 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 );
Fredrik Lundh347ee272006-05-24 16:35:18 +00005846 if (i < 0)
5847 goto nothing;
5848 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5849 if (!u)
5850 return NULL;
5851 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrou64672132010-01-13 07:55:48 +00005852
5853 /* change everything in-place, starting with this one */
5854 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5855 i += str1->length;
5856
5857 while ( --maxcount > 0) {
5858 i = stringlib_find(self->str+i, self->length-i,
5859 str1->str, str1->length,
5860 i);
5861 if (i == -1)
5862 break;
5863 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5864 i += str1->length;
5865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 } else {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005868
Brett Cannona7f13ee2010-05-04 01:16:51 +00005869 Py_ssize_t n, i, j;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005870 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 Py_UNICODE *p;
5872
5873 /* replace strings */
Antoine Pitrou64672132010-01-13 07:55:48 +00005874 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5875 maxcount);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005876 if (n == 0)
5877 goto nothing;
Fredrik Lundh0c71f882006-05-25 16:46:54 +00005878 /* new_size = self->length + n * (str2->length - str1->length)); */
5879 delta = (str2->length - str1->length);
5880 if (delta == 0) {
5881 new_size = self->length;
5882 } else {
5883 product = n * (str2->length - str1->length);
5884 if ((product / (str2->length - str1->length)) != n) {
5885 PyErr_SetString(PyExc_OverflowError,
5886 "replace string is too long");
5887 return NULL;
5888 }
5889 new_size = self->length + product;
5890 if (new_size < 0) {
5891 PyErr_SetString(PyExc_OverflowError,
5892 "replace string is too long");
5893 return NULL;
5894 }
5895 }
5896 u = _PyUnicode_New(new_size);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005897 if (!u)
5898 return NULL;
5899 i = 0;
5900 p = u->str;
5901 if (str1->length > 0) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005902 while (n-- > 0) {
5903 /* look for next match */
Antoine Pitrou64672132010-01-13 07:55:48 +00005904 j = stringlib_find(self->str+i, self->length-i,
5905 str1->str, str1->length,
5906 i);
5907 if (j == -1)
5908 break;
5909 else if (j > i) {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005910 /* copy unchanged part [i:j] */
5911 Py_UNICODE_COPY(p, self->str+i, j-i);
5912 p += j - i;
5913 }
5914 /* copy substitution string */
5915 if (str2->length > 0) {
Fredrik Lundh347ee272006-05-24 16:35:18 +00005916 Py_UNICODE_COPY(p, str2->str, str2->length);
5917 p += str2->length;
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005918 }
5919 i = j + str1->length;
5920 }
5921 if (i < self->length)
5922 /* copy tail [i:] */
5923 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Fredrik Lundh347ee272006-05-24 16:35:18 +00005924 } else {
Fredrik Lundhc2d29c52006-05-27 14:58:20 +00005925 /* interleave */
Fredrik Lundh347ee272006-05-24 16:35:18 +00005926 while (n > 0) {
5927 Py_UNICODE_COPY(p, str2->str, str2->length);
5928 p += str2->length;
5929 if (--n <= 0)
5930 break;
5931 *p++ = self->str[i++];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 }
Fredrik Lundh347ee272006-05-24 16:35:18 +00005933 Py_UNICODE_COPY(p, self->str+i, self->length-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 }
5935 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 return (PyObject *) u;
Fredrik Lundh347ee272006-05-24 16:35:18 +00005937
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005938 nothing:
Fredrik Lundh347ee272006-05-24 16:35:18 +00005939 /* nothing to replace; return original string (when possible) */
5940 if (PyUnicode_CheckExact(self)) {
5941 Py_INCREF(self);
5942 return (PyObject *) self;
5943 }
5944 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945}
5946
5947/* --- Unicode Object Methods --------------------------------------------- */
5948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005949PyDoc_STRVAR(title__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005950 "S.title() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951\n\
5952Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005953characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954
5955static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005956unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 return fixup(self, fixtitle);
5959}
5960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005961PyDoc_STRVAR(capitalize__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00005962 "S.capitalize() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963\n\
5964Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumaran5261b102010-07-05 12:04:07 +00005965have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966
5967static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005968unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970 return fixup(self, fixcapitalize);
5971}
5972
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of S.capwords(): split self
   into words, run every word through fixup()/fixcapitalize and join
   the results again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capped;

        capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6010
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006011/* Argument converter. Coerces to a single unicode character */
6012
6013static int
6014convert_uc(PyObject *obj, void *addr)
6015{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006016 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6017 PyObject *uniobj;
6018 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006019
Benjamin Peterson857ce152009-01-31 16:29:18 +00006020 uniobj = PyUnicode_FromObject(obj);
6021 if (uniobj == NULL) {
6022 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006023 "The fill character cannot be converted to Unicode");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006024 return 0;
6025 }
6026 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6027 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006028 "The fill character must be exactly one character long");
Benjamin Peterson857ce152009-01-31 16:29:18 +00006029 Py_DECREF(uniobj);
6030 return 0;
6031 }
6032 unistr = PyUnicode_AS_UNICODE(uniobj);
6033 *fillcharloc = unistr[0];
6034 Py_DECREF(uniobj);
6035 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006036}
6037
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006038PyDoc_STRVAR(center__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006039 "S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006041Return S centered in a Unicode string of length width. Padding is\n\
6042done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043
6044static PyObject *
6045unicode_center(PyUnicodeObject *self, PyObject *args)
6046{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006047 Py_ssize_t marg, left;
6048 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006049 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050
Thomas Woutersde017742006-02-16 19:34:37 +00006051 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 return NULL;
6053
Tim Peters7a29bd52001-09-12 03:03:31 +00006054 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 Py_INCREF(self);
6056 return (PyObject*) self;
6057 }
6058
6059 marg = width - self->length;
6060 left = marg / 2 + (marg & width & 1);
6061
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006062 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063}
6064
Marc-André Lemburge5034372000-08-08 08:04:29 +00006065#if 0
6066
6067/* This code should go into some future Unicode collation support
6068 module. The basic comparison should compare ordinals on a naive
Georg Brandl18187e22009-06-06 18:21:58 +00006069 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00006070
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006071/* speedy UTF-16 code point order comparison */
6072/* gleaned from: */
6073/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6074
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006075static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006076{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006077 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00006078 0, 0, 0, 0, 0, 0, 0, 0,
6079 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00006080 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006081};
6082
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083static int
6084unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6085{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006086 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006087
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 Py_UNICODE *s1 = str1->str;
6089 Py_UNICODE *s2 = str2->str;
6090
6091 len1 = str1->length;
6092 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006093
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006095 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006096
6097 c1 = *s1++;
6098 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00006099
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006100 if (c1 > (1<<11) * 26)
6101 c1 += utf16Fixup[c1>>11];
6102 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006103 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006104 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00006105
6106 if (c1 != c2)
6107 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00006108
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00006109 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 }
6111
6112 return (len1 < len2) ? -1 : (len1 != len2);
6113}
6114
Marc-André Lemburge5034372000-08-08 08:04:29 +00006115#else
6116
6117static int
6118unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6119{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006120 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006121
6122 Py_UNICODE *s1 = str1->str;
6123 Py_UNICODE *s2 = str2->str;
6124
6125 len1 = str1->length;
6126 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00006127
Marc-André Lemburge5034372000-08-08 08:04:29 +00006128 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00006129 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00006130
Fredrik Lundh45714e92001-06-26 16:39:36 +00006131 c1 = *s1++;
6132 c2 = *s2++;
6133
6134 if (c1 != c2)
6135 return (c1 < c2) ? -1 : 1;
6136
Marc-André Lemburge5034372000-08-08 08:04:29 +00006137 len1--; len2--;
6138 }
6139
6140 return (len1 < len2) ? -1 : (len1 != len2);
6141}
6142
6143#endif
6144
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145int PyUnicode_Compare(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006146 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147{
6148 PyUnicodeObject *u = NULL, *v = NULL;
6149 int result;
6150
6151 /* Coerce the two arguments */
6152 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6153 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006154 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6156 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006157 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006158
Thomas Wouters7e474022000-07-16 12:04:32 +00006159 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 if (v == u) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006161 Py_DECREF(u);
6162 Py_DECREF(v);
6163 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 }
6165
6166 result = unicode_compare(u, v);
6167
6168 Py_DECREF(u);
6169 Py_DECREF(v);
6170 return result;
6171
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006172 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 Py_XDECREF(u);
6174 Py_XDECREF(v);
6175 return -1;
6176}
6177
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006178PyObject *PyUnicode_RichCompare(PyObject *left,
6179 PyObject *right,
6180 int op)
6181{
6182 int result;
6183
6184 result = PyUnicode_Compare(left, right);
6185 if (result == -1 && PyErr_Occurred())
6186 goto onError;
6187
6188 /* Convert the return value to a Boolean */
6189 switch (op) {
6190 case Py_EQ:
6191 result = (result == 0);
6192 break;
6193 case Py_NE:
6194 result = (result != 0);
6195 break;
6196 case Py_LE:
6197 result = (result <= 0);
6198 break;
6199 case Py_GE:
6200 result = (result >= 0);
6201 break;
6202 case Py_LT:
6203 result = (result == -1);
6204 break;
6205 case Py_GT:
6206 result = (result == 1);
6207 break;
6208 }
6209 return PyBool_FromLong(result);
6210
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006211 onError:
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006212
6213 /* Standard case
6214
6215 Type errors mean that PyUnicode_FromObject() could not convert
6216 one of the arguments (usually the right hand side) to Unicode,
6217 ie. we can't handle the comparison request. However, it is
6218 possible that the other object knows a comparison method, which
6219 is why we return Py_NotImplemented to give the other object a
6220 chance.
6221
6222 */
6223 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6224 PyErr_Clear();
6225 Py_INCREF(Py_NotImplemented);
6226 return Py_NotImplemented;
6227 }
6228 if (op != Py_EQ && op != Py_NE)
6229 return NULL;
6230
6231 /* Equality comparison.
6232
6233 This is a special case: we silence any PyExc_UnicodeDecodeError
6234 and instead turn it into a PyErr_UnicodeWarning.
6235
6236 */
6237 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6238 return NULL;
6239 PyErr_Clear();
Benjamin Peterson857ce152009-01-31 16:29:18 +00006240 if (PyErr_Warn(PyExc_UnicodeWarning,
6241 (op == Py_EQ) ?
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006242 "Unicode equal comparison "
6243 "failed to convert both arguments to Unicode - "
6244 "interpreting them as being unequal" :
6245 "Unicode unequal comparison "
6246 "failed to convert both arguments to Unicode - "
6247 "interpreting them as being unequal"
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006248 ) < 0)
Marc-André Lemburg040f76b2006-08-14 10:55:19 +00006249 return NULL;
6250 result = (op == Py_NE);
6251 return PyBool_FromLong(result);
6252}
6253
Guido van Rossum403d68b2000-03-13 15:55:09 +00006254int PyUnicode_Contains(PyObject *container,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006255 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00006256{
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006257 PyObject *str, *sub;
6258 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006259
6260 /* Coerce the two arguments */
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006261 sub = PyUnicode_FromObject(element);
6262 if (!sub) {
Fredrik Lundh833bf942006-05-23 10:12:21 +00006263 return -1;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00006264 }
Fredrik Lundh833bf942006-05-23 10:12:21 +00006265
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006266 str = PyUnicode_FromObject(container);
6267 if (!str) {
6268 Py_DECREF(sub);
Fredrik Lundh833bf942006-05-23 10:12:21 +00006269 return -1;
6270 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00006271
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006272 result = stringlib_contains_obj(str, sub);
Barry Warsaw817918c2002-08-06 16:58:21 +00006273
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006274 Py_DECREF(str);
6275 Py_DECREF(sub);
Guido van Rossum403d68b2000-03-13 15:55:09 +00006276
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006277 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00006278}
6279
Guido van Rossumd57fd912000-03-10 22:53:23 +00006280/* Concat to string or Unicode object giving a new Unicode object. */
6281
6282PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006283 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284{
6285 PyUnicodeObject *u = NULL, *v = NULL, *w;
6286
6287 /* Coerce the two arguments */
6288 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6289 if (u == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006290 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6292 if (v == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006293 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294
6295 /* Shortcuts */
6296 if (v == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006297 Py_DECREF(v);
6298 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 }
6300 if (u == unicode_empty) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006301 Py_DECREF(u);
6302 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 }
6304
6305 /* Concat the two Unicode strings */
6306 w = _PyUnicode_New(u->length + v->length);
6307 if (w == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006308 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 Py_UNICODE_COPY(w->str, u->str, u->length);
6310 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6311
6312 Py_DECREF(u);
6313 Py_DECREF(v);
6314 return (PyObject *)w;
6315
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006316 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 Py_XDECREF(u);
6318 Py_XDECREF(v);
6319 return NULL;
6320}
6321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006322PyDoc_STRVAR(count__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006323 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324\n\
Fredrik Lundh763b50f2006-05-22 15:35:12 +00006325Return the number of non-overlapping occurrences of substring sub in\n\
6326Unicode string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006327interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328
6329static PyObject *
6330unicode_count(PyUnicodeObject *self, PyObject *args)
6331{
6332 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006333 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00006334 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 PyObject *result;
6336
Jesus Cea44e81682011-04-20 16:39:15 +02006337 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6338 &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006339 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006340
Antoine Pitrou64672132010-01-13 07:55:48 +00006341 ADJUST_INDICES(start, end, self->length);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006342 result = PyInt_FromSsize_t(
6343 stringlib_count(self->str + start, end - start,
Antoine Pitrou64672132010-01-13 07:55:48 +00006344 substring->str, substring->length,
6345 PY_SSIZE_T_MAX)
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006346 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347
6348 Py_DECREF(substring);
Fredrik Lundh58b5e842006-05-26 19:24:53 +00006349
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350 return result;
6351}
6352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006353PyDoc_STRVAR(encode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006354 "S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006355\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006356Encodes S using the codec registered for encoding. encoding defaults\n\
6357to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00006358handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6360'xmlcharrefreplace' as well as any other name registered with\n\
6361codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362
6363static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006364unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006366 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00006367 char *encoding = NULL;
6368 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006369 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006370
Benjamin Peterson332d7212009-09-18 21:14:55 +00006371 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6372 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006374 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006375 if (v == NULL)
6376 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006377 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006378 PyErr_Format(PyExc_TypeError,
6379 "encoder did not return a string/unicode object "
6380 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006381 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006382 Py_DECREF(v);
6383 return NULL;
6384 }
6385 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006386
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006387 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006388 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006389}
6390
6391PyDoc_STRVAR(decode__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006392 "S.decode([encoding[,errors]]) -> string or unicode\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006393\n\
6394Decodes S using the codec registered for encoding. encoding defaults\n\
6395to the default encoding. errors may be given to set a different error\n\
6396handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6397a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
Éric Araujoa4c81b02012-02-20 02:07:31 +01006398as well as any other name registered with codecs.register_error that is\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006399able to handle UnicodeDecodeErrors.");
6400
6401static PyObject *
Benjamin Peterson332d7212009-09-18 21:14:55 +00006402unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006403{
Benjamin Peterson332d7212009-09-18 21:14:55 +00006404 static char *kwlist[] = {"encoding", "errors", 0};
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006405 char *encoding = NULL;
6406 char *errors = NULL;
6407 PyObject *v;
Benjamin Peterson857ce152009-01-31 16:29:18 +00006408
Benjamin Peterson332d7212009-09-18 21:14:55 +00006409 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6410 kwlist, &encoding, &errors))
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006411 return NULL;
6412 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006413 if (v == NULL)
6414 goto onError;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00006415 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006416 PyErr_Format(PyExc_TypeError,
6417 "decoder did not return a string/unicode object "
6418 "(type=%.400s)",
Christian Heimese93237d2007-12-19 02:37:44 +00006419 Py_TYPE(v)->tp_name);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006420 Py_DECREF(v);
6421 return NULL;
6422 }
6423 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006424
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006425 onError:
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00006426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427}
6428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006429PyDoc_STRVAR(expandtabs__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006430 "S.expandtabs([tabsize]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431\n\
6432Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006433If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434
6435static PyObject*
6436unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6437{
6438 Py_UNICODE *e;
6439 Py_UNICODE *p;
6440 Py_UNICODE *q;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006441 Py_UNICODE *qe;
6442 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 PyUnicodeObject *u;
6444 int tabsize = 8;
6445
6446 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448
Thomas Wouters7e474022000-07-16 12:04:32 +00006449 /* First pass: determine size of output string */
Guido van Rossum5bdff602008-03-11 21:18:06 +00006450 i = 0; /* chars up to and including most recent \n or \r */
6451 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6452 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 for (p = self->str; p < e; p++)
6454 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006455 if (tabsize > 0) {
6456 incr = tabsize - (j % tabsize); /* cannot overflow */
6457 if (j > PY_SSIZE_T_MAX - incr)
6458 goto overflow1;
6459 j += incr;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006460 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006461 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006462 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006463 if (j > PY_SSIZE_T_MAX - 1)
6464 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465 j++;
6466 if (*p == '\n' || *p == '\r') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006467 if (i > PY_SSIZE_T_MAX - j)
6468 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 i += j;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006470 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 }
6472 }
6473
Guido van Rossum5bdff602008-03-11 21:18:06 +00006474 if (i > PY_SSIZE_T_MAX - j)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006475 goto overflow1;
Neal Norwitz7dbd2a32007-06-09 03:36:34 +00006476
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 /* Second pass: create output string and fill it */
6478 u = _PyUnicode_New(i + j);
6479 if (!u)
6480 return NULL;
6481
Guido van Rossum5bdff602008-03-11 21:18:06 +00006482 j = 0; /* same as in first pass */
6483 q = u->str; /* next output char */
6484 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485
6486 for (p = self->str; p < e; p++)
6487 if (*p == '\t') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006488 if (tabsize > 0) {
6489 i = tabsize - (j % tabsize);
6490 j += i;
6491 while (i--) {
6492 if (q >= qe)
6493 goto overflow2;
6494 *q++ = ' ';
Guido van Rossum5bdff602008-03-11 21:18:06 +00006495 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006496 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00006497 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006498 else {
6499 if (q >= qe)
6500 goto overflow2;
6501 *q++ = *p;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006502 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 if (*p == '\n' || *p == '\r')
6504 j = 0;
6505 }
6506
6507 return (PyObject*) u;
Guido van Rossum5bdff602008-03-11 21:18:06 +00006508
6509 overflow2:
6510 Py_DECREF(u);
6511 overflow1:
6512 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514}
6515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006516PyDoc_STRVAR(find__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006517 "S.find(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006518\n\
6519Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08006520such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521arguments start and end are interpreted as in slice notation.\n\
6522\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006523Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524
6525static PyObject *
6526unicode_find(PyUnicodeObject *self, PyObject *args)
6527{
Jesus Cea44e81682011-04-20 16:39:15 +02006528 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006529 Py_ssize_t start;
6530 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006531 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532
Jesus Cea44e81682011-04-20 16:39:15 +02006533 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6534 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006537 result = stringlib_find_slice(
6538 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6539 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6540 start, end
6541 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542
6543 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006544
6545 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546}
6547
6548static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00006549unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550{
6551 if (index < 0 || index >= self->length) {
6552 PyErr_SetString(PyExc_IndexError, "string index out of range");
6553 return NULL;
6554 }
6555
6556 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6557}
6558
6559static long
6560unicode_hash(PyUnicodeObject *self)
6561{
Fredrik Lundhdde61642000-07-10 18:27:47 +00006562 /* Since Unicode objects compare equal to their ASCII string
6563 counterparts, they should use the individual character values
6564 as basis for their hash value. This is needed to assure that
6565 strings and Unicode objects behave in the same way as
6566 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567
Martin v. Löwis18e16552006-02-15 17:27:45 +00006568 register Py_ssize_t len;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006569 register Py_UNICODE *p;
6570 register long x;
6571
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006572#ifdef Py_DEBUG
Benjamin Peterson26da9202012-02-21 11:08:50 -05006573 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf51c3842012-04-09 14:53:07 -04006574#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 if (self->hash != -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006576 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006577 len = PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006578 /*
6579 We make the hash of the empty string be 0, rather than using
6580 (prefix ^ suffix), since this slightly obfuscates the hash secret
6581 */
6582 if (len == 0) {
6583 self->hash = 0;
6584 return 0;
6585 }
Fredrik Lundhdde61642000-07-10 18:27:47 +00006586 p = PyUnicode_AS_UNICODE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006587 x = _Py_HashSecret.prefix;
6588 x ^= *p << 7;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006589 while (--len >= 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006590 x = (1000003*x) ^ *p++;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006591 x ^= PyUnicode_GET_SIZE(self);
Barry Warsaw1e13eb02012-02-20 20:42:21 -05006592 x ^= _Py_HashSecret.suffix;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006593 if (x == -1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006594 x = -2;
Fredrik Lundhdde61642000-07-10 18:27:47 +00006595 self->hash = x;
6596 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597}
6598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006599PyDoc_STRVAR(index__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006600 "S.index(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006602Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603
6604static PyObject *
6605unicode_index(PyUnicodeObject *self, PyObject *args)
6606{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006607 Py_ssize_t result;
Jesus Cea44e81682011-04-20 16:39:15 +02006608 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00006609 Py_ssize_t start;
6610 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
Jesus Cea44e81682011-04-20 16:39:15 +02006612 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6613 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615
Fredrik Lundh60d8b182006-05-27 15:20:22 +00006616 result = stringlib_find_slice(
6617 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6618 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6619 start, end
6620 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621
6622 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006623
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 if (result < 0) {
6625 PyErr_SetString(PyExc_ValueError, "substring not found");
6626 return NULL;
6627 }
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00006628
Martin v. Löwis18e16552006-02-15 17:27:45 +00006629 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630}
6631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006632PyDoc_STRVAR(islower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006633 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006635Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006636at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637
6638static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006639unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640{
6641 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6642 register const Py_UNICODE *e;
6643 int cased;
6644
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 /* Shortcut for single character strings */
6646 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006647 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006649 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006650 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006651 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006652
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 e = p + PyUnicode_GET_SIZE(self);
6654 cased = 0;
6655 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006656 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006657
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006658 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6659 return PyBool_FromLong(0);
6660 else if (!cased && Py_UNICODE_ISLOWER(ch))
6661 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006663 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664}
6665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006666PyDoc_STRVAR(isupper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006667 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006669Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006670at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671
6672static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006673unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674{
6675 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6676 register const Py_UNICODE *e;
6677 int cased;
6678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 /* Shortcut for single character strings */
6680 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006681 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006683 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006684 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006685 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006686
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 e = p + PyUnicode_GET_SIZE(self);
6688 cased = 0;
6689 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006690 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006691
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006692 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6693 return PyBool_FromLong(0);
6694 else if (!cased && Py_UNICODE_ISUPPER(ch))
6695 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006697 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698}
6699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006700PyDoc_STRVAR(istitle__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006701 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006702\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006703Return True if S is a titlecased string and there is at least one\n\
6704character in S, i.e. upper- and titlecase characters may only\n\
6705follow uncased characters and lowercase characters only cased ones.\n\
6706Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006707
6708static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006709unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710{
6711 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6712 register const Py_UNICODE *e;
6713 int cased, previous_is_cased;
6714
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 /* Shortcut for single character strings */
6716 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006717 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6718 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006720 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006721 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006722 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006723
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 e = p + PyUnicode_GET_SIZE(self);
6725 cased = 0;
6726 previous_is_cased = 0;
6727 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006728 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006729
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006730 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6731 if (previous_is_cased)
6732 return PyBool_FromLong(0);
6733 previous_is_cased = 1;
6734 cased = 1;
6735 }
6736 else if (Py_UNICODE_ISLOWER(ch)) {
6737 if (!previous_is_cased)
6738 return PyBool_FromLong(0);
6739 previous_is_cased = 1;
6740 cased = 1;
6741 }
6742 else
6743 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006745 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746}
6747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006748PyDoc_STRVAR(isspace__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006749 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006751Return True if all characters in S are whitespace\n\
6752and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
6754static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006755unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756{
6757 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6758 register const Py_UNICODE *e;
6759
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 /* Shortcut for single character strings */
6761 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006762 Py_UNICODE_ISSPACE(*p))
6763 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006765 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006766 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006767 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006768
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 e = p + PyUnicode_GET_SIZE(self);
6770 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006771 if (!Py_UNICODE_ISSPACE(*p))
6772 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006774 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775}
6776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006777PyDoc_STRVAR(isalpha__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006778 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006779\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006780Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006781and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006782
6783static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006784unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006785{
6786 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6787 register const Py_UNICODE *e;
6788
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006789 /* Shortcut for single character strings */
6790 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006791 Py_UNICODE_ISALPHA(*p))
6792 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006793
6794 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006795 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006796 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006797
6798 e = p + PyUnicode_GET_SIZE(self);
6799 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006800 if (!Py_UNICODE_ISALPHA(*p))
6801 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006802 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006803 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006804}
6805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006806PyDoc_STRVAR(isalnum__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006807 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006808\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006809Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006810and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006811
6812static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006813unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006814{
6815 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6816 register const Py_UNICODE *e;
6817
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006818 /* Shortcut for single character strings */
6819 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006820 Py_UNICODE_ISALNUM(*p))
6821 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006822
6823 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006824 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006825 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006826
6827 e = p + PyUnicode_GET_SIZE(self);
6828 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006829 if (!Py_UNICODE_ISALNUM(*p))
6830 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006831 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006832 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00006833}
6834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006835PyDoc_STRVAR(isdecimal__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006836 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006838Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006839False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840
6841static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006842unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843{
6844 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6845 register const Py_UNICODE *e;
6846
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 /* Shortcut for single character strings */
6848 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006849 Py_UNICODE_ISDECIMAL(*p))
6850 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006852 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006853 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006854 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006855
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 e = p + PyUnicode_GET_SIZE(self);
6857 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006858 if (!Py_UNICODE_ISDECIMAL(*p))
6859 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006861 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862}
6863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006864PyDoc_STRVAR(isdigit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006865 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00006867Return True if all characters in S are digits\n\
6868and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869
6870static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006871unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872{
6873 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6874 register const Py_UNICODE *e;
6875
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 /* Shortcut for single character strings */
6877 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006878 Py_UNICODE_ISDIGIT(*p))
6879 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006881 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006882 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006883 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006884
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 e = p + PyUnicode_GET_SIZE(self);
6886 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006887 if (!Py_UNICODE_ISDIGIT(*p))
6888 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006890 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891}
6892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006893PyDoc_STRVAR(isnumeric__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006894 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00006896Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006897False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898
6899static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006900unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901{
6902 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6903 register const Py_UNICODE *e;
6904
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 /* Shortcut for single character strings */
6906 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006907 Py_UNICODE_ISNUMERIC(*p))
6908 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006910 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00006911 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006912 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00006913
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 e = p + PyUnicode_GET_SIZE(self);
6915 for (; p < e; p++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006916 if (!Py_UNICODE_ISNUMERIC(*p))
6917 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006918 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00006919 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920}
6921
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006922PyDoc_STRVAR(join__doc__,
Georg Brandl9b4e5822009-10-14 18:48:32 +00006923 "S.join(iterable) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924\n\
6925Return a string which is the concatenation of the strings in the\n\
Georg Brandl9b4e5822009-10-14 18:48:32 +00006926iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927
/* Method wrapper for S.join(iterable): forwards the separator (self) and
   the argument unchanged to PyUnicode_Join() and returns its result. */
static PyObject*
unicode_join(PyObject *self, PyObject *data)
{
    return PyUnicode_Join(self, data);
}
6933
/* Return the value of the object's `length` field. */
static Py_ssize_t
unicode_length(PyUnicodeObject *self)
{
    return self->length;
}
6939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006940PyDoc_STRVAR(ljust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006941 "S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00006943Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006944done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945
6946static PyObject *
6947unicode_ljust(PyUnicodeObject *self, PyObject *args)
6948{
Martin v. Löwis412fb672006-04-13 06:34:32 +00006949 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006950 Py_UNICODE fillchar = ' ';
6951
Martin v. Löwis412fb672006-04-13 06:34:32 +00006952 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953 return NULL;
6954
Tim Peters7a29bd52001-09-12 03:03:31 +00006955 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 Py_INCREF(self);
6957 return (PyObject*) self;
6958 }
6959
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006960 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961}
6962
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006963PyDoc_STRVAR(lower__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006964 "S.lower() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006966Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967
/* Method wrapper for S.lower(): returns whatever fixup() produces when
   given self and the fixlower callback. */
static PyObject*
unicode_lower(PyUnicodeObject *self)
{
    return fixup(self, fixlower);
}
6973
/* Strip modes passed as `striptype` to the strip helpers below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the 3-character "|O:" prefix of the
   corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
6982
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006983/* externally visible for str.strip(unicode) */
6984PyObject *
6985_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6986{
Benjamin Peterson857ce152009-01-31 16:29:18 +00006987 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6988 Py_ssize_t len = PyUnicode_GET_SIZE(self);
6989 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6990 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6991 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006992
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006993 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Fredrik Lundhb63588c2006-05-23 18:44:25 +00006994
Benjamin Peterson857ce152009-01-31 16:29:18 +00006995 i = 0;
6996 if (striptype != RIGHTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00006997 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6998 i++;
6999 }
Benjamin Peterson857ce152009-01-31 16:29:18 +00007000 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007001
Benjamin Peterson857ce152009-01-31 16:29:18 +00007002 j = len;
7003 if (striptype != LEFTSTRIP) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007004 do {
7005 j--;
7006 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7007 j++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007008 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007009
Benjamin Peterson857ce152009-01-31 16:29:18 +00007010 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007011 Py_INCREF(self);
7012 return (PyObject*)self;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007013 }
7014 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007015 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007016}
7017
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018
7019static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007020do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007022 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7023 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007024
Benjamin Peterson857ce152009-01-31 16:29:18 +00007025 i = 0;
7026 if (striptype != RIGHTSTRIP) {
7027 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7028 i++;
7029 }
7030 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007031
Benjamin Peterson857ce152009-01-31 16:29:18 +00007032 j = len;
7033 if (striptype != LEFTSTRIP) {
7034 do {
7035 j--;
7036 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7037 j++;
7038 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007039
Benjamin Peterson857ce152009-01-31 16:29:18 +00007040 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7041 Py_INCREF(self);
7042 return (PyObject*)self;
7043 }
7044 else
7045 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046}
7047
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007048
7049static PyObject *
7050do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7051{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007052 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007053
Benjamin Peterson857ce152009-01-31 16:29:18 +00007054 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7055 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007056
Benjamin Peterson857ce152009-01-31 16:29:18 +00007057 if (sep != NULL && sep != Py_None) {
7058 if (PyUnicode_Check(sep))
7059 return _PyUnicode_XStrip(self, striptype, sep);
7060 else if (PyString_Check(sep)) {
7061 PyObject *res;
7062 sep = PyUnicode_FromObject(sep);
7063 if (sep==NULL)
7064 return NULL;
7065 res = _PyUnicode_XStrip(self, striptype, sep);
7066 Py_DECREF(sep);
7067 return res;
7068 }
7069 else {
7070 PyErr_Format(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007071 "%s arg must be None, unicode or str",
7072 STRIPNAME(striptype));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007073 return NULL;
7074 }
7075 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007076
Benjamin Peterson857ce152009-01-31 16:29:18 +00007077 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007078}
7079
7080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007081PyDoc_STRVAR(strip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007082 "S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007083\n\
7084Return a copy of the string S with leading and trailing\n\
7085whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007086If chars is given and not None, remove characters in chars instead.\n\
7087If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007088
7089static PyObject *
7090unicode_strip(PyUnicodeObject *self, PyObject *args)
7091{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007092 if (PyTuple_GET_SIZE(args) == 0)
7093 return do_strip(self, BOTHSTRIP); /* Common case */
7094 else
7095 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007096}
7097
7098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007099PyDoc_STRVAR(lstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007100 "S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007101\n\
7102Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007103If chars is given and not None, remove characters in chars instead.\n\
7104If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007105
7106static PyObject *
7107unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7108{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007109 if (PyTuple_GET_SIZE(args) == 0)
7110 return do_strip(self, LEFTSTRIP); /* Common case */
7111 else
7112 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007113}
7114
7115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007116PyDoc_STRVAR(rstrip__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007117 "S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007118\n\
7119Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00007120If chars is given and not None, remove characters in chars instead.\n\
7121If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007122
7123static PyObject *
7124unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7125{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007126 if (PyTuple_GET_SIZE(args) == 0)
7127 return do_strip(self, RIGHTSTRIP); /* Common case */
7128 else
7129 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007130}
7131
7132
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007134unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135{
7136 PyUnicodeObject *u;
7137 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007138 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00007139 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140
7141 if (len < 0)
7142 len = 0;
7143
Tim Peters7a29bd52001-09-12 03:03:31 +00007144 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 /* no repeat, return original string */
7146 Py_INCREF(str);
7147 return (PyObject*) str;
7148 }
Tim Peters8f422462000-09-09 06:13:41 +00007149
7150 /* ensure # of chars needed doesn't overflow int and # of bytes
7151 * needed doesn't overflow size_t
7152 */
7153 nchars = len * str->length;
7154 if (len && nchars / len != str->length) {
7155 PyErr_SetString(PyExc_OverflowError,
7156 "repeated string is too long");
7157 return NULL;
7158 }
7159 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7160 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7161 PyErr_SetString(PyExc_OverflowError,
7162 "repeated string is too long");
7163 return NULL;
7164 }
7165 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 if (!u)
7167 return NULL;
7168
7169 p = u->str;
7170
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007171 if (str->length == 1 && len > 0) {
7172 Py_UNICODE_FILL(p, str->str[0], len);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007173 } else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007174 Py_ssize_t done = 0; /* number of characters copied this far */
7175 if (done < nchars) {
Fredrik Lundhf1d60a52006-05-22 16:29:30 +00007176 Py_UNICODE_COPY(p, str->str, str->length);
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007177 done = str->length;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007178 }
7179 while (done < nchars) {
Neal Norwitz4677fbf72008-03-25 04:18:18 +00007180 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007181 Py_UNICODE_COPY(p+done, p, n);
7182 done += n;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007183 }
Fredrik Lundh8a8e05a2006-05-22 17:12:58 +00007184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185
7186 return (PyObject*) u;
7187}
7188
7189PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007190 PyObject *subobj,
7191 PyObject *replobj,
7192 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193{
7194 PyObject *self;
7195 PyObject *str1;
7196 PyObject *str2;
7197 PyObject *result;
7198
7199 self = PyUnicode_FromObject(obj);
7200 if (self == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 str1 = PyUnicode_FromObject(subobj);
7203 if (str1 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007204 Py_DECREF(self);
7205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 }
7207 str2 = PyUnicode_FromObject(replobj);
7208 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007209 Py_DECREF(self);
7210 Py_DECREF(str1);
7211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 }
Tim Petersced69f82003-09-16 20:30:58 +00007213 result = replace((PyUnicodeObject *)self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007214 (PyUnicodeObject *)str1,
7215 (PyUnicodeObject *)str2,
7216 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217 Py_DECREF(self);
7218 Py_DECREF(str1);
7219 Py_DECREF(str2);
7220 return result;
7221}
7222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007223PyDoc_STRVAR(replace__doc__,
Ezio Melotti2f06b782010-06-26 18:44:42 +00007224 "S.replace(old, new[, count]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225\n\
7226Return a copy of S with all occurrences of substring\n\
Georg Brandl30fadc12008-05-30 07:54:16 +00007227old replaced by new. If the optional argument count is\n\
7228given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229
7230static PyObject*
7231unicode_replace(PyUnicodeObject *self, PyObject *args)
7232{
7233 PyUnicodeObject *str1;
7234 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007235 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236 PyObject *result;
7237
Martin v. Löwis18e16552006-02-15 17:27:45 +00007238 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 return NULL;
7240 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7241 if (str1 == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007242 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007243 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007244 if (str2 == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007245 Py_DECREF(str1);
7246 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00007247 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007248
7249 result = replace(self, str1, str2, maxcount);
7250
7251 Py_DECREF(str1);
7252 Py_DECREF(str2);
7253 return result;
7254}
7255
7256static
7257PyObject *unicode_repr(PyObject *unicode)
7258{
7259 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007260 PyUnicode_GET_SIZE(unicode),
7261 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262}
7263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007264PyDoc_STRVAR(rfind__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007265 "S.rfind(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266\n\
7267Return the highest index in S where substring sub is found,\n\
Senthil Kumaran5e3a19d2011-07-27 23:36:51 +08007268such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269arguments start and end are interpreted as in slice notation.\n\
7270\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007271Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272
7273static PyObject *
7274unicode_rfind(PyUnicodeObject *self, PyObject *args)
7275{
Jesus Cea44e81682011-04-20 16:39:15 +02007276 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007277 Py_ssize_t start;
7278 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007279 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007280
Jesus Cea44e81682011-04-20 16:39:15 +02007281 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7282 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007285 result = stringlib_rfind_slice(
7286 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7287 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7288 start, end
7289 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007290
7291 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007292
7293 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294}
7295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007296PyDoc_STRVAR(rindex__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007297 "S.rindex(sub [,start [,end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007299Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300
7301static PyObject *
7302unicode_rindex(PyUnicodeObject *self, PyObject *args)
7303{
Jesus Cea44e81682011-04-20 16:39:15 +02007304 PyUnicodeObject *substring;
Facundo Batista57d56692007-11-16 18:04:14 +00007305 Py_ssize_t start;
7306 Py_ssize_t end;
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007307 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007308
Jesus Cea44e81682011-04-20 16:39:15 +02007309 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7310 &start, &end))
Benjamin Peterson857ce152009-01-31 16:29:18 +00007311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312
Fredrik Lundh60d8b182006-05-27 15:20:22 +00007313 result = stringlib_rfind_slice(
7314 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7315 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7316 start, end
7317 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318
7319 Py_DECREF(substring);
Fredrik Lundh2d23d5b2006-05-27 10:05:10 +00007320
Guido van Rossumd57fd912000-03-10 22:53:23 +00007321 if (result < 0) {
7322 PyErr_SetString(PyExc_ValueError, "substring not found");
7323 return NULL;
7324 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007325 return PyInt_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326}
7327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007328PyDoc_STRVAR(rjust__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007329 "S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007330\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007331Return S right-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007332done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007333
7334static PyObject *
7335unicode_rjust(PyUnicodeObject *self, PyObject *args)
7336{
Martin v. Löwis412fb672006-04-13 06:34:32 +00007337 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007338 Py_UNICODE fillchar = ' ';
7339
Martin v. Löwis412fb672006-04-13 06:34:32 +00007340 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007341 return NULL;
7342
Tim Peters7a29bd52001-09-12 03:03:31 +00007343 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007344 Py_INCREF(self);
7345 return (PyObject*) self;
7346 }
7347
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007348 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349}
7350
Guido van Rossumd57fd912000-03-10 22:53:23 +00007351static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00007352unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353{
7354 /* standard clamping */
7355 if (start < 0)
7356 start = 0;
7357 if (end < 0)
7358 end = 0;
7359 if (end > self->length)
7360 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00007361 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007362 /* full slice, return original string */
7363 Py_INCREF(self);
7364 return (PyObject*) self;
7365 }
7366 if (start > end)
7367 start = end;
7368 /* copy slice */
7369 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007370 end - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371}
7372
7373PyObject *PyUnicode_Split(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007374 PyObject *sep,
7375 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376{
7377 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00007378
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 s = PyUnicode_FromObject(s);
7380 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007381 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007382 if (sep != NULL) {
7383 sep = PyUnicode_FromObject(sep);
7384 if (sep == NULL) {
7385 Py_DECREF(s);
7386 return NULL;
7387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 }
7389
7390 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7391
7392 Py_DECREF(s);
7393 Py_XDECREF(sep);
7394 return result;
7395}
7396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007397PyDoc_STRVAR(split__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007398 "S.split([sep [,maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399\n\
7400Return a list of the words in S, using sep as the\n\
7401delimiter string. If maxsplit is given, at most maxsplit\n\
Georg Brandldfb77db2008-05-11 09:11:40 +00007402splits are done. If sep is not specified or is None, any\n\
Georg Brandlecbbd942008-05-11 20:53:55 +00007403whitespace string is a separator and empty strings are\n\
7404removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405
7406static PyObject*
7407unicode_split(PyUnicodeObject *self, PyObject *args)
7408{
7409 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007410 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411
Martin v. Löwis18e16552006-02-15 17:27:45 +00007412 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413 return NULL;
7414
7415 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007416 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007417 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007418 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007420 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421}
7422
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007423PyObject *
7424PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7425{
7426 PyObject* str_obj;
7427 PyObject* sep_obj;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007428 PyObject* out;
Fredrik Lundhb9479482006-05-26 17:22:38 +00007429
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007430 str_obj = PyUnicode_FromObject(str_in);
7431 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007432 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007433 sep_obj = PyUnicode_FromObject(sep_in);
Fredrik Lundhb9479482006-05-26 17:22:38 +00007434 if (!sep_obj) {
7435 Py_DECREF(str_obj);
7436 return NULL;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007437 }
7438
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007439 out = stringlib_partition(
Fredrik Lundhb9479482006-05-26 17:22:38 +00007440 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7441 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7442 );
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007443
Fredrik Lundhb9479482006-05-26 17:22:38 +00007444 Py_DECREF(sep_obj);
7445 Py_DECREF(str_obj);
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007446
7447 return out;
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007448}
7449
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007450
7451PyObject *
7452PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7453{
7454 PyObject* str_obj;
7455 PyObject* sep_obj;
7456 PyObject* out;
7457
7458 str_obj = PyUnicode_FromObject(str_in);
7459 if (!str_obj)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007460 return NULL;
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007461 sep_obj = PyUnicode_FromObject(sep_in);
7462 if (!sep_obj) {
7463 Py_DECREF(str_obj);
7464 return NULL;
7465 }
7466
Fredrik Lundh58b5e842006-05-26 19:24:53 +00007467 out = stringlib_rpartition(
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007468 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7469 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7470 );
7471
7472 Py_DECREF(sep_obj);
7473 Py_DECREF(str_obj);
7474
7475 return out;
7476}
7477
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007478PyDoc_STRVAR(partition__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007479 "S.partition(sep) -> (head, sep, tail)\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007480\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007481Search for the separator sep in S, and return the part before it,\n\
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007482the separator itself, and the part after it. If the separator is not\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007483found, return S and two empty strings.");
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007484
7485static PyObject*
Fredrik Lundh450277f2006-05-26 09:46:59 +00007486unicode_partition(PyUnicodeObject *self, PyObject *separator)
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007487{
Fredrik Lundh06a69dd2006-05-26 08:54:28 +00007488 return PyUnicode_Partition((PyObject *)self, separator);
7489}
7490
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007491PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti1fafaab2010-01-25 11:24:37 +00007492 "S.rpartition(sep) -> (head, sep, tail)\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007493\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007494Search for the separator sep in S, starting at the end of S, and return\n\
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007495the part before it, the separator itself, and the part after it. If the\n\
Andrew M. Kuchlingefeb43e2008-10-04 01:05:56 +00007496separator is not found, return two empty strings and S.");
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007497
7498static PyObject*
7499unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7500{
7501 return PyUnicode_RPartition((PyObject *)self, separator);
7502}
7503
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007504PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007505 PyObject *sep,
7506 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007507{
7508 PyObject *result;
Benjamin Peterson857ce152009-01-31 16:29:18 +00007509
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007510 s = PyUnicode_FromObject(s);
7511 if (s == NULL)
Benjamin Peterson857ce152009-01-31 16:29:18 +00007512 return NULL;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007513 if (sep != NULL) {
7514 sep = PyUnicode_FromObject(sep);
7515 if (sep == NULL) {
7516 Py_DECREF(s);
7517 return NULL;
7518 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007519 }
7520
7521 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7522
7523 Py_DECREF(s);
7524 Py_XDECREF(sep);
7525 return result;
7526}
7527
7528PyDoc_STRVAR(rsplit__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007529 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007530\n\
7531Return a list of the words in S, using sep as the\n\
7532delimiter string, starting at the end of the string and\n\
7533working to the front. If maxsplit is given, at most maxsplit\n\
7534splits are done. If sep is not specified, any whitespace string\n\
7535is a separator.");
7536
7537static PyObject*
7538unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7539{
7540 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007541 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007542
Martin v. Löwis18e16552006-02-15 17:27:45 +00007543 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007544 return NULL;
7545
7546 if (substring == Py_None)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007547 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007548 else if (PyUnicode_Check(substring))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007549 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007550 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007551 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007552}
7553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007554PyDoc_STRVAR(splitlines__doc__,
Raymond Hettingeraad5b022012-06-02 01:42:58 -04007555 "S.splitlines(keepends=False) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556\n\
7557Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00007558Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007559is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560
7561static PyObject*
7562unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7563{
Guido van Rossum86662912000-04-11 15:38:46 +00007564 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565
Guido van Rossum86662912000-04-11 15:38:46 +00007566 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 return NULL;
7568
Guido van Rossum86662912000-04-11 15:38:46 +00007569 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007570}
7571
7572static
7573PyObject *unicode_str(PyUnicodeObject *self)
7574{
Fred Drakee4315f52000-05-09 19:53:39 +00007575 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576}
7577
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007578PyDoc_STRVAR(swapcase__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007579 "S.swapcase() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580\n\
7581Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007582and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583
7584static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007585unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587 return fixup(self, fixswapcase);
7588}
7589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007590PyDoc_STRVAR(translate__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007591 "S.translate(table) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592\n\
7593Return a copy of the string S, where all characters have been mapped\n\
7594through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00007595Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7596Unmapped characters are left untouched. Characters mapped to None\n\
7597are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598
7599static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007600unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601{
Tim Petersced69f82003-09-16 20:30:58 +00007602 return PyUnicode_TranslateCharmap(self->str,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007603 self->length,
7604 table,
7605 "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606}
7607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007608PyDoc_STRVAR(upper__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007609 "S.upper() -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007611Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612
7613static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007614unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616 return fixup(self, fixupper);
7617}
7618
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007619PyDoc_STRVAR(zfill__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007620 "S.zfill(width) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621\n\
Georg Brandl98064072008-09-09 19:26:00 +00007622Pad a numeric string S with zeros on the left, to fill a field\n\
7623of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
7625static PyObject *
7626unicode_zfill(PyUnicodeObject *self, PyObject *args)
7627{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007628 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 PyUnicodeObject *u;
7630
Martin v. Löwis18e16552006-02-15 17:27:45 +00007631 Py_ssize_t width;
7632 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633 return NULL;
7634
7635 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00007636 if (PyUnicode_CheckExact(self)) {
7637 Py_INCREF(self);
7638 return (PyObject*) self;
7639 }
7640 else
7641 return PyUnicode_FromUnicode(
7642 PyUnicode_AS_UNICODE(self),
7643 PyUnicode_GET_SIZE(self)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007644 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 }
7646
7647 fill = width - self->length;
7648
7649 u = pad(self, fill, 0, '0');
7650
Walter Dörwald068325e2002-04-15 13:36:47 +00007651 if (u == NULL)
7652 return NULL;
7653
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654 if (u->str[fill] == '+' || u->str[fill] == '-') {
7655 /* move sign to beginning of string */
7656 u->str[0] = u->str[fill];
7657 u->str[fill] = '0';
7658 }
7659
7660 return (PyObject*) u;
7661}
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662
#if 0
/* Compiled out: returns the current value of `numfree` as an int object. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007671PyDoc_STRVAR(startswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007672 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007674Return True if S starts with the specified prefix, False otherwise.\n\
7675With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007676With optional end, stop comparing S at that position.\n\
7677prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678
7679static PyObject *
7680unicode_startswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007681 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682{
Georg Brandl24250812006-06-09 18:45:48 +00007683 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007685 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007686 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007687 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688
Jesus Cea44e81682011-04-20 16:39:15 +02007689 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007690 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007691 if (PyTuple_Check(subobj)) {
7692 Py_ssize_t i;
7693 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7694 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007695 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007696 if (substring == NULL)
7697 return NULL;
7698 result = tailmatch(self, substring, start, end, -1);
7699 Py_DECREF(substring);
7700 if (result) {
7701 Py_RETURN_TRUE;
7702 }
7703 }
7704 /* nothing matched */
7705 Py_RETURN_FALSE;
7706 }
7707 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007708 if (substring == NULL) {
7709 if (PyErr_ExceptionMatches(PyExc_TypeError))
7710 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7711 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007712 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007713 }
Georg Brandl24250812006-06-09 18:45:48 +00007714 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007716 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717}
7718
7719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007720PyDoc_STRVAR(endswith__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007721 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00007723Return True if S ends with the specified suffix, False otherwise.\n\
7724With optional start, test S beginning at that position.\n\
Georg Brandl24250812006-06-09 18:45:48 +00007725With optional end, stop comparing S at that position.\n\
7726suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727
7728static PyObject *
7729unicode_endswith(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007730 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731{
Georg Brandl24250812006-06-09 18:45:48 +00007732 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007734 Py_ssize_t start = 0;
Martin v. Löwis412fb672006-04-13 06:34:32 +00007735 Py_ssize_t end = PY_SSIZE_T_MAX;
Georg Brandl24250812006-06-09 18:45:48 +00007736 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737
Jesus Cea44e81682011-04-20 16:39:15 +02007738 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007739 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007740 if (PyTuple_Check(subobj)) {
7741 Py_ssize_t i;
7742 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7743 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007744 PyTuple_GET_ITEM(subobj, i));
Georg Brandl24250812006-06-09 18:45:48 +00007745 if (substring == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007746 return NULL;
Georg Brandl24250812006-06-09 18:45:48 +00007747 result = tailmatch(self, substring, start, end, +1);
7748 Py_DECREF(substring);
7749 if (result) {
7750 Py_RETURN_TRUE;
7751 }
7752 }
7753 Py_RETURN_FALSE;
7754 }
7755 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottie3685f62011-04-26 05:12:51 +03007756 if (substring == NULL) {
7757 if (PyErr_ExceptionMatches(PyExc_TypeError))
7758 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7759 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007760 return NULL;
Ezio Melottie3685f62011-04-26 05:12:51 +03007761 }
Georg Brandl24250812006-06-09 18:45:48 +00007762 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 Py_DECREF(substring);
Georg Brandl24250812006-06-09 18:45:48 +00007764 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765}
7766
7767
Eric Smitha9f7d622008-02-17 19:46:49 +00007768/* Implements do_string_format, which is unicode because of stringlib */
7769#include "stringlib/string_format.h"
7770
7771PyDoc_STRVAR(format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007772 "S.format(*args, **kwargs) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007773\n\
Eric Smith6c840852010-11-06 19:43:44 +00007774Return a formatted version of S, using substitutions from args and kwargs.\n\
7775The substitutions are identified by braces ('{' and '}').");
Eric Smitha9f7d622008-02-17 19:46:49 +00007776
Eric Smithdc13b792008-05-30 18:10:04 +00007777static PyObject *
7778unicode__format__(PyObject *self, PyObject *args)
7779{
7780 PyObject *format_spec;
7781 PyObject *result = NULL;
7782 PyObject *tmp = NULL;
7783
7784 /* If 2.x, convert format_spec to the same type as value */
7785 /* This is to allow things like u''.format('') */
7786 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7787 goto done;
7788 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7789 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007790 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
Eric Smithdc13b792008-05-30 18:10:04 +00007791 goto done;
7792 }
7793 tmp = PyObject_Unicode(format_spec);
7794 if (tmp == NULL)
7795 goto done;
7796 format_spec = tmp;
7797
7798 result = _PyUnicode_FormatAdvanced(self,
7799 PyUnicode_AS_UNICODE(format_spec),
7800 PyUnicode_GET_SIZE(format_spec));
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007801 done:
Eric Smithdc13b792008-05-30 18:10:04 +00007802 Py_XDECREF(tmp);
7803 return result;
7804}
7805
Eric Smitha9f7d622008-02-17 19:46:49 +00007806PyDoc_STRVAR(p_format__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007807 "S.__format__(format_spec) -> unicode\n\
Eric Smitha9f7d622008-02-17 19:46:49 +00007808\n\
Eric Smith6c840852010-11-06 19:43:44 +00007809Return a formatted version of S as described by format_spec.");
Eric Smitha9f7d622008-02-17 19:46:49 +00007810
Robert Schuppenies901c9972008-06-10 10:10:31 +00007811static PyObject *
7812unicode__sizeof__(PyUnicodeObject *v)
7813{
Robert Schuppenies9be2ec12008-07-10 15:24:04 +00007814 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7815 sizeof(Py_UNICODE) * (v->length + 1));
Robert Schuppenies901c9972008-06-10 10:10:31 +00007816}
7817
7818PyDoc_STRVAR(sizeof__doc__,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007819 "S.__sizeof__() -> size of S in memory, in bytes\n\
Robert Schuppenies901c9972008-06-10 10:10:31 +00007820\n\
7821");
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007822
7823static PyObject *
7824unicode_getnewargs(PyUnicodeObject *v)
7825{
Benjamin Peterson857ce152009-01-31 16:29:18 +00007826 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00007827}
7828
7829
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830static PyMethodDef unicode_methods[] = {
Benjamin Peterson332d7212009-09-18 21:14:55 +00007831 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007832 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7833 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00007834 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007835 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7836 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7837 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7838 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7839 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7840 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7841 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Fredrik Lundh450277f2006-05-26 09:46:59 +00007842 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007843 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7844 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7845 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007846 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Benjamin Peterson332d7212009-09-18 21:14:55 +00007847 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007848/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7849 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7850 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7851 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007852 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Fredrik Lundhb3167cb2006-05-26 18:15:38 +00007853 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007854 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00007855 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007856 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7857 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7858 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7859 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7860 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7861 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7862 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7863 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7864 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7865 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7866 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7867 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7868 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7869 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007870 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smitha9f7d622008-02-17 19:46:49 +00007871 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7872 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7873 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7874 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
Robert Schuppenies901c9972008-06-10 10:10:31 +00007875 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00007876#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007877 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878#endif
7879
7880#if 0
7881 /* This one is just used for debugging the implementation. */
Christian Heimes5b970ad2008-02-06 13:33:44 +00007882 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883#endif
7884
Benjamin Peterson857ce152009-01-31 16:29:18 +00007885 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 {NULL, NULL}
7887};
7888
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007889static PyObject *
7890unicode_mod(PyObject *v, PyObject *w)
7891{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007892 if (!PyUnicode_Check(v)) {
7893 Py_INCREF(Py_NotImplemented);
7894 return Py_NotImplemented;
7895 }
7896 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007897}
7898
7899static PyNumberMethods unicode_as_number = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007900 0, /*nb_add*/
7901 0, /*nb_subtract*/
7902 0, /*nb_multiply*/
7903 0, /*nb_divide*/
7904 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007905};
7906
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007908 (lenfunc) unicode_length, /* sq_length */
7909 PyUnicode_Concat, /* sq_concat */
7910 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7911 (ssizeargfunc) unicode_getitem, /* sq_item */
7912 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7913 0, /* sq_ass_item */
7914 0, /* sq_ass_slice */
7915 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916};
7917
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007918static PyObject*
7919unicode_subscript(PyUnicodeObject* self, PyObject* item)
7920{
Marc-André Lemburg3a457792006-08-14 12:57:27 +00007921 if (PyIndex_Check(item)) {
7922 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007923 if (i == -1 && PyErr_Occurred())
7924 return NULL;
7925 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007926 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007927 return unicode_getitem(self, i);
7928 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00007929 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007930 Py_UNICODE* source_buf;
7931 Py_UNICODE* result_buf;
7932 PyObject* result;
7933
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007934 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007935 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007936 return NULL;
7937 }
7938
7939 if (slicelength <= 0) {
7940 return PyUnicode_FromUnicode(NULL, 0);
Thomas Wouters3ccec682007-08-28 15:28:19 +00007941 } else if (start == 0 && step == 1 && slicelength == self->length &&
7942 PyUnicode_CheckExact(self)) {
7943 Py_INCREF(self);
7944 return (PyObject *)self;
7945 } else if (step == 1) {
7946 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007947 } else {
7948 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Neal Norwitz419fd492008-03-17 20:22:43 +00007949 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7950 sizeof(Py_UNICODE));
Benjamin Peterson857ce152009-01-31 16:29:18 +00007951
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007952 if (result_buf == NULL)
7953 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007954
7955 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7956 result_buf[i] = source_buf[cur];
7957 }
Tim Petersced69f82003-09-16 20:30:58 +00007958
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007959 result = PyUnicode_FromUnicode(result_buf, slicelength);
Neal Norwitz419fd492008-03-17 20:22:43 +00007960 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007961 return result;
7962 }
7963 } else {
7964 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7965 return NULL;
7966 }
7967}
7968
7969static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson857ce152009-01-31 16:29:18 +00007970 (lenfunc)unicode_length, /* mp_length */
7971 (binaryfunc)unicode_subscript, /* mp_subscript */
7972 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007973};
7974
Martin v. Löwis18e16552006-02-15 17:27:45 +00007975static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976unicode_buffer_getreadbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007977 Py_ssize_t index,
7978 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979{
7980 if (index != 0) {
7981 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007982 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983 return -1;
7984 }
7985 *ptr = (void *) self->str;
7986 return PyUnicode_GET_DATA_SIZE(self);
7987}
7988
Martin v. Löwis18e16552006-02-15 17:27:45 +00007989static Py_ssize_t
7990unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007991 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992{
7993 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00007994 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 return -1;
7996}
7997
7998static int
7999unicode_buffer_getsegcount(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008000 Py_ssize_t *lenp)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001{
8002 if (lenp)
8003 *lenp = PyUnicode_GET_DATA_SIZE(self);
8004 return 1;
8005}
8006
Martin v. Löwiseb079f12006-02-16 14:32:27 +00008007static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008unicode_buffer_getcharbuf(PyUnicodeObject *self,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008009 Py_ssize_t index,
8010 const void **ptr)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011{
8012 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00008013
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 if (index != 0) {
8015 PyErr_SetString(PyExc_SystemError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008016 "accessing non-existent unicode segment");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 return -1;
8018 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00008019 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 if (str == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008021 return -1;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008022 *ptr = (void *) PyString_AS_STRING(str);
8023 return PyString_GET_SIZE(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024}
8025
8026/* Helpers for PyUnicode_Format() */
8027
8028static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00008029getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008031 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 if (argidx < arglen) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008033 (*p_argidx)++;
8034 if (arglen < 0)
8035 return args;
8036 else
8037 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 }
8039 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008040 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 return NULL;
8042}
8043
/* Bit flags collected from the flag characters of a % conversion. */
#define F_LJUST (1 << 0)    /* '-' flag */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049
Martin v. Löwis18e16552006-02-15 17:27:45 +00008050static Py_ssize_t
Neal Norwitzfc76d632006-01-10 06:03:13 +00008051strtounicode(Py_UNICODE *buffer, const char *charbuffer)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008053 register Py_ssize_t i;
8054 Py_ssize_t len = strlen(charbuffer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 for (i = len - 1; i >= 0; i--)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008056 buffer[i] = (Py_UNICODE) charbuffer[i];
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 return len;
8059}
8060
Neal Norwitzfc76d632006-01-10 06:03:13 +00008061static int
Neal Norwitzfc76d632006-01-10 06:03:13 +00008062longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8063{
Tim Peters15231542006-02-16 01:08:01 +00008064 Py_ssize_t result;
8065
Neal Norwitzfc76d632006-01-10 06:03:13 +00008066 PyOS_snprintf((char *)buffer, len, format, x);
Tim Peters15231542006-02-16 01:08:01 +00008067 result = strtounicode(buffer, (char *)buffer);
8068 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
Neal Norwitzfc76d632006-01-10 06:03:13 +00008069}
8070
Guido van Rossum078151d2002-08-11 04:24:12 +00008071/* XXX To save some code duplication, formatfloat/long/int could have been
8072 shared with stringobject.c, converting from 8-bit to Unicode after the
8073 formatting is done. */
8074
Mark Dickinson18cfada2009-11-23 18:46:41 +00008075/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8076
8077static PyObject *
8078formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079{
Mark Dickinson18cfada2009-11-23 18:46:41 +00008080 char *p;
8081 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 double x;
Tim Petersced69f82003-09-16 20:30:58 +00008083
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 x = PyFloat_AsDouble(v);
8085 if (x == -1.0 && PyErr_Occurred())
Mark Dickinson18cfada2009-11-23 18:46:41 +00008086 return NULL;
8087
Guido van Rossumd57fd912000-03-10 22:53:23 +00008088 if (prec < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008089 prec = 6;
Mark Dickinsond4814bf2009-03-29 16:24:29 +00008090
Mark Dickinson18cfada2009-11-23 18:46:41 +00008091 p = PyOS_double_to_string(x, type, prec,
8092 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8093 if (p == NULL)
8094 return NULL;
8095 result = PyUnicode_FromStringAndSize(p, strlen(p));
8096 PyMem_Free(p);
8097 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098}
8099
Tim Peters38fd5b62000-09-21 05:43:11 +00008100static PyObject*
8101formatlong(PyObject *val, int flags, int prec, int type)
8102{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008103 char *buf;
8104 int i, len;
8105 PyObject *str; /* temporary string object. */
8106 PyUnicodeObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008107
Benjamin Peterson857ce152009-01-31 16:29:18 +00008108 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8109 if (!str)
8110 return NULL;
8111 result = _PyUnicode_New(len);
8112 if (!result) {
8113 Py_DECREF(str);
8114 return NULL;
8115 }
8116 for (i = 0; i < len; i++)
8117 result->str[i] = buf[i];
8118 result->str[len] = 0;
8119 Py_DECREF(str);
8120 return (PyObject*)result;
Tim Peters38fd5b62000-09-21 05:43:11 +00008121}
8122
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123static int
8124formatint(Py_UNICODE *buf,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008125 size_t buflen,
8126 int flags,
8127 int prec,
8128 int type,
8129 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008131 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008132 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8133 * + 1 + 1
8134 * = 24
8135 */
Tim Peters38fd5b62000-09-21 05:43:11 +00008136 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008137 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 long x;
8139
8140 x = PyInt_AsLong(v);
8141 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008142 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008143 if (x < 0 && type == 'u') {
8144 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00008145 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008146 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8147 sign = "-";
8148 else
8149 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008151 prec = 1;
8152
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008153 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8154 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008155 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008156 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008157 PyErr_SetString(PyExc_OverflowError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008158 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008159 return -1;
8160 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008161
8162 if ((flags & F_ALT) &&
8163 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00008164 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008165 * of issues that cause pain:
8166 * - when 0 is being converted, the C standard leaves off
8167 * the '0x' or '0X', which is inconsistent with other
8168 * %#x/%#X conversions and inconsistent with Python's
8169 * hex() function
8170 * - there are platforms that violate the standard and
8171 * convert 0 with the '0x' or '0X'
8172 * (Metrowerks, Compaq Tru64)
8173 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00008174 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008175 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00008176 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008177 * We can achieve the desired consistency by inserting our
8178 * own '0x' or '0X' prefix, and substituting %x/%X in place
8179 * of %#x/%#X.
8180 *
8181 * Note that this is the same approach as used in
8182 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008183 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008184 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8185 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00008186 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008187 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008188 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8189 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00008190 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00008191 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008192 if (sign[0])
Neal Norwitzfc76d632006-01-10 06:03:13 +00008193 return longtounicode(buf, buflen, fmt, -x);
Guido van Rossum6c9e1302003-11-29 23:52:13 +00008194 else
Neal Norwitzfc76d632006-01-10 06:03:13 +00008195 return longtounicode(buf, buflen, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196}
8197
8198static int
8199formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008200 size_t buflen,
8201 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202{
Ezio Melotti32125152010-02-25 17:36:04 +00008203 PyObject *unistr;
8204 char *str;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008205 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008206 if (PyUnicode_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008207 if (PyUnicode_GET_SIZE(v) != 1)
8208 goto onError;
8209 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008212 else if (PyString_Check(v)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008213 if (PyString_GET_SIZE(v) != 1)
8214 goto onError;
Ezio Melotti32125152010-02-25 17:36:04 +00008215 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8216 with a UnicodeDecodeError if 'char' is not decodable with the
8217 default encoding (usually ASCII, but it might be something else) */
8218 str = PyString_AS_STRING(v);
8219 if ((unsigned char)str[0] > 0x7F) {
8220 /* the char is not ASCII; try to decode the string using the
8221 default encoding and return -1 to let the UnicodeDecodeError
8222 be raised if the string can't be decoded */
8223 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8224 if (unistr == NULL)
8225 return -1;
8226 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8227 Py_DECREF(unistr);
8228 }
8229 else
8230 buf[0] = (Py_UNICODE)str[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008232
8233 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008234 /* Integer input truncated to a character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235 long x;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008236 x = PyInt_AsLong(v);
8237 if (x == -1 && PyErr_Occurred())
8238 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008239#ifdef Py_UNICODE_WIDE
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008240 if (x < 0 || x > 0x10ffff) {
8241 PyErr_SetString(PyExc_OverflowError,
8242 "%c arg not in range(0x110000) "
8243 "(wide Python build)");
8244 return -1;
8245 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008246#else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008247 if (x < 0 || x > 0xffff) {
8248 PyErr_SetString(PyExc_OverflowError,
8249 "%c arg not in range(0x10000) "
8250 "(narrow Python build)");
8251 return -1;
8252 }
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00008253#endif
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008254 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255 }
8256 buf[1] = '\0';
8257 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008258
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008259 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008260 PyErr_SetString(PyExc_TypeError,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008261 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00008262 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263}
8264
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008265/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8266
Mark Dickinson18cfada2009-11-23 18:46:41 +00008267 FORMATBUFLEN is the length of the buffer in which the ints &
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00008268 chars are formatted. XXX This is a magic number. Each formatting
8269 routine does bounds checking to ensure no overflow, but a better
8270 solution may be to malloc a buffer of appropriate size for each
8271 format. For now, the current solution is sufficient.
8272*/
/* Length, in characters, of the scratch buffer used for int and char
   formatting.  The expansion is fully parenthesized so the cast stays
   attached to the constant in any expression context (for example as
   the operand of sizeof). */
#define FORMATBUFLEN ((size_t)120)
8274
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275PyObject *PyUnicode_Format(PyObject *format,
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008276 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277{
8278 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008279 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 int args_owned = 0;
8281 PyUnicodeObject *result = NULL;
8282 PyObject *dict = NULL;
8283 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00008284
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 if (format == NULL || args == NULL) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008286 PyErr_BadInternalCall();
8287 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288 }
8289 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00008290 if (uformat == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 fmt = PyUnicode_AS_UNICODE(uformat);
8293 fmtcnt = PyUnicode_GET_SIZE(uformat);
8294
8295 reslen = rescnt = fmtcnt + 100;
8296 result = _PyUnicode_New(reslen);
8297 if (result == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008298 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 res = PyUnicode_AS_UNICODE(result);
8300
8301 if (PyTuple_Check(args)) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008302 arglen = PyTuple_Size(args);
8303 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 }
8305 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008306 arglen = -1;
8307 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 }
Benjamin Peterson23d49d32012-08-28 17:55:35 -04008309 if (PyMapping_Check(args) && !PyTuple_Check(args) &&
Neal Norwitz80a1bf42002-11-12 23:01:12 +00008310 !PyObject_TypeCheck(args, &PyBaseString_Type))
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008311 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312
8313 while (--fmtcnt >= 0) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008314 if (*fmt != '%') {
8315 if (--rescnt < 0) {
8316 rescnt = fmtcnt + 100;
8317 reslen += rescnt;
8318 if (_PyUnicode_Resize(&result, reslen) < 0)
8319 goto onError;
8320 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8321 --rescnt;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008322 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008323 *res++ = *fmt++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008324 }
8325 else {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008326 /* Got a format specifier */
8327 int flags = 0;
8328 Py_ssize_t width = -1;
8329 int prec = -1;
8330 Py_UNICODE c = '\0';
8331 Py_UNICODE fill;
8332 int isnumok;
8333 PyObject *v = NULL;
8334 PyObject *temp = NULL;
8335 Py_UNICODE *pbuf;
8336 Py_UNICODE sign;
8337 Py_ssize_t len;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008338 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008339
8340 fmt++;
8341 if (*fmt == '(') {
8342 Py_UNICODE *keystart;
8343 Py_ssize_t keylen;
8344 PyObject *key;
8345 int pcount = 1;
8346
8347 if (dict == NULL) {
8348 PyErr_SetString(PyExc_TypeError,
8349 "format requires a mapping");
8350 goto onError;
8351 }
8352 ++fmt;
8353 --fmtcnt;
8354 keystart = fmt;
8355 /* Skip over balanced parentheses */
8356 while (pcount > 0 && --fmtcnt >= 0) {
8357 if (*fmt == ')')
8358 --pcount;
8359 else if (*fmt == '(')
8360 ++pcount;
8361 fmt++;
8362 }
8363 keylen = fmt - keystart - 1;
8364 if (fmtcnt < 0 || pcount > 0) {
8365 PyErr_SetString(PyExc_ValueError,
8366 "incomplete format key");
8367 goto onError;
8368 }
8369#if 0
8370 /* keys are converted to strings using UTF-8 and
8371 then looked up since Python uses strings to hold
8372 variables names etc. in its namespaces and we
8373 wouldn't want to break common idioms. */
8374 key = PyUnicode_EncodeUTF8(keystart,
8375 keylen,
8376 NULL);
8377#else
8378 key = PyUnicode_FromUnicode(keystart, keylen);
8379#endif
8380 if (key == NULL)
8381 goto onError;
8382 if (args_owned) {
8383 Py_DECREF(args);
8384 args_owned = 0;
8385 }
8386 args = PyObject_GetItem(dict, key);
8387 Py_DECREF(key);
8388 if (args == NULL) {
8389 goto onError;
8390 }
8391 args_owned = 1;
8392 arglen = -1;
8393 argidx = -2;
8394 }
8395 while (--fmtcnt >= 0) {
8396 switch (c = *fmt++) {
8397 case '-': flags |= F_LJUST; continue;
8398 case '+': flags |= F_SIGN; continue;
8399 case ' ': flags |= F_BLANK; continue;
8400 case '#': flags |= F_ALT; continue;
8401 case '0': flags |= F_ZERO; continue;
8402 }
8403 break;
8404 }
8405 if (c == '*') {
8406 v = getnextarg(args, arglen, &argidx);
8407 if (v == NULL)
8408 goto onError;
8409 if (!PyInt_Check(v)) {
8410 PyErr_SetString(PyExc_TypeError,
8411 "* wants int");
8412 goto onError;
8413 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008414 width = PyInt_AsSsize_t(v);
8415 if (width == -1 && PyErr_Occurred())
8416 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008417 if (width < 0) {
8418 flags |= F_LJUST;
8419 width = -width;
8420 }
8421 if (--fmtcnt >= 0)
8422 c = *fmt++;
8423 }
8424 else if (c >= '0' && c <= '9') {
8425 width = c - '0';
8426 while (--fmtcnt >= 0) {
8427 c = *fmt++;
8428 if (c < '0' || c > '9')
8429 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008430 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008431 PyErr_SetString(PyExc_ValueError,
8432 "width too big");
8433 goto onError;
8434 }
8435 width = width*10 + (c - '0');
8436 }
8437 }
8438 if (c == '.') {
8439 prec = 0;
8440 if (--fmtcnt >= 0)
8441 c = *fmt++;
8442 if (c == '*') {
8443 v = getnextarg(args, arglen, &argidx);
8444 if (v == NULL)
8445 goto onError;
8446 if (!PyInt_Check(v)) {
8447 PyErr_SetString(PyExc_TypeError,
8448 "* wants int");
8449 goto onError;
8450 }
Serhiy Storchaka74f49ab2013-01-19 12:55:39 +02008451 prec = _PyInt_AsInt(v);
8452 if (prec == -1 && PyErr_Occurred())
8453 goto onError;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008454 if (prec < 0)
8455 prec = 0;
8456 if (--fmtcnt >= 0)
8457 c = *fmt++;
8458 }
8459 else if (c >= '0' && c <= '9') {
8460 prec = c - '0';
8461 while (--fmtcnt >= 0) {
Stefan Krah0b9201f2010-07-19 18:06:46 +00008462 c = *fmt++;
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008463 if (c < '0' || c > '9')
8464 break;
Mark Dickinson75d36002012-10-28 10:00:46 +00008465 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008466 PyErr_SetString(PyExc_ValueError,
8467 "prec too big");
8468 goto onError;
8469 }
8470 prec = prec*10 + (c - '0');
8471 }
8472 }
8473 } /* prec */
8474 if (fmtcnt >= 0) {
8475 if (c == 'h' || c == 'l' || c == 'L') {
8476 if (--fmtcnt >= 0)
8477 c = *fmt++;
8478 }
8479 }
8480 if (fmtcnt < 0) {
8481 PyErr_SetString(PyExc_ValueError,
8482 "incomplete format");
8483 goto onError;
8484 }
8485 if (c != '%') {
8486 v = getnextarg(args, arglen, &argidx);
8487 if (v == NULL)
8488 goto onError;
8489 }
8490 sign = 0;
8491 fill = ' ';
8492 switch (c) {
8493
8494 case '%':
8495 pbuf = formatbuf;
8496 /* presume that buffer length is at least 1 */
8497 pbuf[0] = '%';
8498 len = 1;
8499 break;
8500
8501 case 's':
8502 case 'r':
Victor Stinner95affc42010-03-22 12:24:37 +00008503 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008504 temp = v;
8505 Py_INCREF(temp);
8506 }
8507 else {
8508 PyObject *unicode;
8509 if (c == 's')
8510 temp = PyObject_Unicode(v);
8511 else
8512 temp = PyObject_Repr(v);
8513 if (temp == NULL)
8514 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008515 if (PyUnicode_Check(temp))
8516 /* nothing to do */;
Gregory P. Smithdd96db62008-06-09 04:58:54 +00008517 else if (PyString_Check(temp)) {
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00008518 /* convert to string to Unicode */
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008519 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8520 PyString_GET_SIZE(temp),
8521 NULL,
8522 "strict");
8523 Py_DECREF(temp);
8524 temp = unicode;
8525 if (temp == NULL)
8526 goto onError;
8527 }
8528 else {
8529 Py_DECREF(temp);
8530 PyErr_SetString(PyExc_TypeError,
8531 "%s argument has non-string str()");
8532 goto onError;
8533 }
8534 }
8535 pbuf = PyUnicode_AS_UNICODE(temp);
8536 len = PyUnicode_GET_SIZE(temp);
8537 if (prec >= 0 && len > prec)
8538 len = prec;
8539 break;
8540
8541 case 'i':
8542 case 'd':
8543 case 'u':
8544 case 'o':
8545 case 'x':
8546 case 'X':
8547 if (c == 'i')
8548 c = 'd';
8549 isnumok = 0;
8550 if (PyNumber_Check(v)) {
8551 PyObject *iobj=NULL;
8552
8553 if (PyInt_Check(v) || (PyLong_Check(v))) {
8554 iobj = v;
8555 Py_INCREF(iobj);
8556 }
8557 else {
8558 iobj = PyNumber_Int(v);
8559 if (iobj==NULL) iobj = PyNumber_Long(v);
8560 }
8561 if (iobj!=NULL) {
8562 if (PyInt_Check(iobj)) {
8563 isnumok = 1;
8564 pbuf = formatbuf;
8565 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8566 flags, prec, c, iobj);
8567 Py_DECREF(iobj);
8568 if (len < 0)
8569 goto onError;
8570 sign = 1;
8571 }
8572 else if (PyLong_Check(iobj)) {
8573 isnumok = 1;
8574 temp = formatlong(iobj, flags, prec, c);
8575 Py_DECREF(iobj);
8576 if (!temp)
8577 goto onError;
8578 pbuf = PyUnicode_AS_UNICODE(temp);
8579 len = PyUnicode_GET_SIZE(temp);
8580 sign = 1;
8581 }
8582 else {
8583 Py_DECREF(iobj);
8584 }
8585 }
8586 }
8587 if (!isnumok) {
8588 PyErr_Format(PyExc_TypeError,
8589 "%%%c format: a number is required, "
8590 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8591 goto onError;
8592 }
8593 if (flags & F_ZERO)
8594 fill = '0';
8595 break;
8596
8597 case 'e':
8598 case 'E':
8599 case 'f':
8600 case 'F':
8601 case 'g':
8602 case 'G':
Mark Dickinson18cfada2009-11-23 18:46:41 +00008603 temp = formatfloat(v, flags, prec, c);
8604 if (temp == NULL)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008605 goto onError;
Mark Dickinson18cfada2009-11-23 18:46:41 +00008606 pbuf = PyUnicode_AS_UNICODE(temp);
8607 len = PyUnicode_GET_SIZE(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008608 sign = 1;
8609 if (flags & F_ZERO)
8610 fill = '0';
8611 break;
8612
8613 case 'c':
8614 pbuf = formatbuf;
8615 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8616 if (len < 0)
8617 goto onError;
8618 break;
8619
8620 default:
8621 PyErr_Format(PyExc_ValueError,
8622 "unsupported format character '%c' (0x%x) "
8623 "at index %zd",
8624 (31<=c && c<=126) ? (char)c : '?',
8625 (int)c,
8626 (Py_ssize_t)(fmt - 1 -
8627 PyUnicode_AS_UNICODE(uformat)));
Benjamin Peterson857ce152009-01-31 16:29:18 +00008628 goto onError;
8629 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008630 if (sign) {
8631 if (*pbuf == '-' || *pbuf == '+') {
8632 sign = *pbuf++;
8633 len--;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008634 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008635 else if (flags & F_SIGN)
8636 sign = '+';
8637 else if (flags & F_BLANK)
8638 sign = ' ';
8639 else
8640 sign = 0;
8641 }
8642 if (width < len)
8643 width = len;
8644 if (rescnt - (sign != 0) < width) {
8645 reslen -= rescnt;
8646 rescnt = width + fmtcnt + 100;
8647 reslen += rescnt;
8648 if (reslen < 0) {
8649 Py_XDECREF(temp);
8650 PyErr_NoMemory();
8651 goto onError;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008652 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008653 if (_PyUnicode_Resize(&result, reslen) < 0) {
8654 Py_XDECREF(temp);
8655 goto onError;
8656 }
8657 res = PyUnicode_AS_UNICODE(result)
8658 + reslen - rescnt;
8659 }
8660 if (sign) {
8661 if (fill != ' ')
8662 *res++ = sign;
8663 rescnt--;
8664 if (width > len)
8665 width--;
8666 }
8667 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8668 assert(pbuf[0] == '0');
8669 assert(pbuf[1] == c);
8670 if (fill != ' ') {
8671 *res++ = *pbuf++;
8672 *res++ = *pbuf++;
8673 }
8674 rescnt -= 2;
8675 width -= 2;
8676 if (width < 0)
8677 width = 0;
8678 len -= 2;
8679 }
8680 if (width > len && !(flags & F_LJUST)) {
8681 do {
8682 --rescnt;
8683 *res++ = fill;
8684 } while (--width > len);
8685 }
8686 if (fill == ' ') {
8687 if (sign)
8688 *res++ = sign;
8689 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8690 assert(pbuf[0] == '0');
8691 assert(pbuf[1] == c);
8692 *res++ = *pbuf++;
8693 *res++ = *pbuf++;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008694 }
8695 }
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008696 Py_UNICODE_COPY(res, pbuf, len);
8697 res += len;
8698 rescnt -= len;
8699 while (--width >= len) {
8700 --rescnt;
8701 *res++ = ' ';
8702 }
8703 if (dict && (argidx < arglen) && c != '%') {
8704 PyErr_SetString(PyExc_TypeError,
8705 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00008706 Py_XDECREF(temp);
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008707 goto onError;
8708 }
8709 Py_XDECREF(temp);
8710 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 } /* until end */
8712 if (argidx < arglen && !dict) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008713 PyErr_SetString(PyExc_TypeError,
8714 "not all arguments converted during string formatting");
8715 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 }
8717
Thomas Woutersa96affe2006-03-12 00:29:36 +00008718 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008719 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008721 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 }
8723 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 return (PyObject *)result;
8725
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008726 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008727 Py_XDECREF(result);
8728 Py_DECREF(uformat);
8729 if (args_owned) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008730 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008731 }
8732 return NULL;
8733}
8734
8735static PyBufferProcs unicode_as_buffer = {
Martin v. Löwis18e16552006-02-15 17:27:45 +00008736 (readbufferproc) unicode_buffer_getreadbuf,
8737 (writebufferproc) unicode_buffer_getwritebuf,
8738 (segcountproc) unicode_buffer_getsegcount,
8739 (charbufferproc) unicode_buffer_getcharbuf,
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740};
8741
Jeremy Hylton938ace62002-07-17 16:30:39 +00008742static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00008743unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8744
Tim Peters6d6c1a32001-08-02 04:15:00 +00008745static PyObject *
8746unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8747{
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008748 PyObject *x = NULL;
Benjamin Peterson857ce152009-01-31 16:29:18 +00008749 static char *kwlist[] = {"string", "encoding", "errors", 0};
8750 char *encoding = NULL;
8751 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00008752
Benjamin Peterson857ce152009-01-31 16:29:18 +00008753 if (type != &PyUnicode_Type)
8754 return unicode_subtype_new(type, args, kwds);
8755 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008756 kwlist, &x, &encoding, &errors))
Benjamin Peterson857ce152009-01-31 16:29:18 +00008757 return NULL;
8758 if (x == NULL)
8759 return (PyObject *)_PyUnicode_New(0);
8760 if (encoding == NULL && errors == NULL)
8761 return PyObject_Unicode(x);
8762 else
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008763 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00008764}
8765
Guido van Rossume023fe02001-08-30 03:12:59 +00008766static PyObject *
8767unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8768{
Benjamin Peterson857ce152009-01-31 16:29:18 +00008769 PyUnicodeObject *tmp, *pnew;
8770 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00008771
Benjamin Peterson857ce152009-01-31 16:29:18 +00008772 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8773 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8774 if (tmp == NULL)
8775 return NULL;
8776 assert(PyUnicode_Check(tmp));
8777 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8778 if (pnew == NULL) {
8779 Py_DECREF(tmp);
8780 return NULL;
8781 }
8782 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8783 if (pnew->str == NULL) {
8784 _Py_ForgetReference((PyObject *)pnew);
8785 PyObject_Del(pnew);
8786 Py_DECREF(tmp);
8787 return PyErr_NoMemory();
8788 }
8789 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8790 pnew->length = n;
8791 pnew->hash = tmp->hash;
8792 Py_DECREF(tmp);
8793 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00008794}
8795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008796PyDoc_STRVAR(unicode_doc,
Chris Jerdonekad4b0002012-10-07 20:37:54 -07008797 "unicode(object='') -> unicode object\n\
8798unicode(string[, encoding[, errors]]) -> unicode object\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00008799\n\
8800Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00008801encoding defaults to the current default string encoding.\n\
8802errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00008803
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804PyTypeObject PyUnicode_Type = {
Martin v. Löwis68192102007-07-21 06:55:02 +00008805 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson857ce152009-01-31 16:29:18 +00008806 "unicode", /* tp_name */
8807 sizeof(PyUnicodeObject), /* tp_size */
8808 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809 /* Slots */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008810 (destructor)unicode_dealloc, /* tp_dealloc */
8811 0, /* tp_print */
8812 0, /* tp_getattr */
8813 0, /* tp_setattr */
8814 0, /* tp_compare */
8815 unicode_repr, /* tp_repr */
8816 &unicode_as_number, /* tp_as_number */
8817 &unicode_as_sequence, /* tp_as_sequence */
8818 &unicode_as_mapping, /* tp_as_mapping */
8819 (hashfunc) unicode_hash, /* tp_hash*/
8820 0, /* tp_call*/
8821 (reprfunc) unicode_str, /* tp_str */
8822 PyObject_GenericGetAttr, /* tp_getattro */
8823 0, /* tp_setattro */
8824 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00008825 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008826 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson857ce152009-01-31 16:29:18 +00008827 unicode_doc, /* tp_doc */
8828 0, /* tp_traverse */
8829 0, /* tp_clear */
8830 PyUnicode_RichCompare, /* tp_richcompare */
8831 0, /* tp_weaklistoffset */
8832 0, /* tp_iter */
8833 0, /* tp_iternext */
8834 unicode_methods, /* tp_methods */
8835 0, /* tp_members */
8836 0, /* tp_getset */
8837 &PyBaseString_Type, /* tp_base */
8838 0, /* tp_dict */
8839 0, /* tp_descr_get */
8840 0, /* tp_descr_set */
8841 0, /* tp_dictoffset */
8842 0, /* tp_init */
8843 0, /* tp_alloc */
8844 unicode_new, /* tp_new */
8845 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846};
8847
8848/* Initialize the Unicode implementation */
8849
Thomas Wouters78890102000-07-22 19:25:51 +00008850void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008852 int i;
8853
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008854 /* XXX - move this array to unicodectype.c ? */
8855 Py_UNICODE linebreak[] = {
8856 0x000A, /* LINE FEED */
8857 0x000D, /* CARRIAGE RETURN */
8858 0x001C, /* FILE SEPARATOR */
8859 0x001D, /* GROUP SEPARATOR */
8860 0x001E, /* RECORD SEPARATOR */
8861 0x0085, /* NEXT LINE */
8862 0x2028, /* LINE SEPARATOR */
8863 0x2029, /* PARAGRAPH SEPARATOR */
8864 };
8865
Fred Drakee4315f52000-05-09 19:53:39 +00008866 /* Init the implementation */
Christian Heimes5b970ad2008-02-06 13:33:44 +00008867 free_list = NULL;
8868 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 unicode_empty = _PyUnicode_New(0);
Neal Norwitze1fdb322006-07-21 05:32:28 +00008870 if (!unicode_empty)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008871 return;
Neal Norwitze1fdb322006-07-21 05:32:28 +00008872
Marc-André Lemburg90e81472000-06-07 09:13:21 +00008873 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008874 for (i = 0; i < 256; i++)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008875 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00008876 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008877 Py_FatalError("Can't initialize 'unicode'");
Fredrik Lundhb63588c2006-05-23 18:44:25 +00008878
8879 /* initialize the linebreak bloom filter */
8880 bloom_linebreak = make_bloom_mask(
8881 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8882 );
Neal Norwitzde4c78a2006-06-13 08:28:19 +00008883
8884 PyType_Ready(&EncodingMapType);
Benjamin Peterson6da3ed62012-10-30 23:21:10 -04008885
8886 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8887 Py_FatalError("Can't initialize field name iterator type");
8888
8889 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8890 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891}
8892
8893/* Finalize the Unicode implementation */
8894
Christian Heimes3b718a72008-02-14 12:47:33 +00008895int
8896PyUnicode_ClearFreeList(void)
8897{
8898 int freelist_size = numfree;
8899 PyUnicodeObject *u;
8900
8901 for (u = free_list; u != NULL;) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008902 PyUnicodeObject *v = u;
8903 u = *(PyUnicodeObject **)u;
8904 if (v->str)
8905 PyObject_DEL(v->str);
8906 Py_XDECREF(v->defenc);
8907 PyObject_Del(v);
8908 numfree--;
Christian Heimes3b718a72008-02-14 12:47:33 +00008909 }
8910 free_list = NULL;
8911 assert(numfree == 0);
8912 return freelist_size;
8913}
8914
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915void
Thomas Wouters78890102000-07-22 19:25:51 +00008916_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008918 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00008920 Py_XDECREF(unicode_empty);
8921 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00008922
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008923 for (i = 0; i < 256; i++) {
Benjamin Petersonbe1399e2009-01-31 22:03:19 +00008924 if (unicode_latin1[i]) {
8925 Py_DECREF(unicode_latin1[i]);
8926 unicode_latin1[i] = NULL;
8927 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008928 }
Christian Heimes3b718a72008-02-14 12:47:33 +00008929 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00008931
Anthony Baxterac6bd462006-04-13 02:06:09 +00008932#ifdef __cplusplus
8933}
8934#endif