blob: c4cfe1bca05cf7a4da8838e8cea812ae77db7067 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
98 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +000099 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
Christian Heimes190d79e2008-01-30 11:58:22 +0000114/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: CHARACTER TABULATION (0x09), LINE FEED (0x0A),
       LINE TABULATION (0x0B), FORM FEED (0x0C), CARRIAGE RETURN (0x0D) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: FILE SEPARATOR (0x1C), GROUP SEPARATOR (0x1D),
       RECORD SEPARATOR (0x1E), UNIT SEPARATOR (0x1F) */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: SPACE (0x20) */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
144
/* Forward declarations for two encoder error helpers.  Only the prototypes
   appear at this point in the file; NOTE(review): the definitions are
   presumably further down -- confirm. */
static PyObject *unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void raise_encode_exception(PyObject **exceptionObject,
                                   const char *encoding,
                                   const Py_UNICODE *unicode, Py_ssize_t size,
                                   Py_ssize_t startpos, Py_ssize_t endpos,
                                   const char *reason);
155
Christian Heimes190d79e2008-01-30 11:58:22 +0000156/* Same for linebreaks */
/* Lookup table indexed by an ASCII code (0..127): 1 if the character is
   treated as a line break, 0 otherwise.  The table is never written to, so
   it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
183
184
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000185Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000186PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000188#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000189 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 /* This is actually an illegal character, so it should
192 not be passed to unichr. */
193 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#endif
195}
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197/* --- Bloom Filters ----------------------------------------------------- */
198
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on LONG_BIT) of each
   unicode character as the bit index. */
202
203/* the linebreak mask is set up by Unicode_Init below */
204
Antoine Pitrouf068f942010-01-13 14:19:12 +0000205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215#define BLOOM_MASK unsigned long
216
217static BLOOM_MASK bloom_linebreak;
218
Antoine Pitrouf068f942010-01-13 14:19:12 +0000219#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
220#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221
Benjamin Peterson29060642009-01-31 22:14:21 +0000222#define BLOOM_LINEBREAK(ch) \
223 ((ch) < 128U ? ascii_linebreak[(ch)] : \
224 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228 /* calculate simple bloom-style bitmask for a given unicode string */
229
Antoine Pitrouf068f942010-01-13 14:19:12 +0000230 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 Py_ssize_t i;
232
233 mask = 0;
234 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000235 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236
237 return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242 Py_ssize_t i;
243
244 for (i = 0; i < setlen; i++)
245 if (set[i] == chr)
246 return 1;
247
248 return 0;
249}
250
/* Non-zero when chr is a member of set[0..setlen).  The bloom mask is
   checked first; unicode_member() is only evaluated on a mask hit.  The
   whole expansion is parenthesized so the macro composes correctly with
   surrounding operators such as `!`. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
253
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254/* --- Unicode Object ----------------------------------------------------- */
255
256static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259{
260 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000261
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000266 /* Resizing shared object (unicode_empty or single character
267 objects) in-place is not allowed. Use PyUnicode_Resize()
268 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 (unicode->length == 1 &&
272 unicode->str[0] < 256U &&
273 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000275 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 return -1;
277 }
278
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279 /* We allocate one more byte to make sure the string is Ux0000 terminated.
280 The overallocation is also used by fastsearch, which assumes that it's
281 safe to look at str[length] (without making any assumptions about what
282 it contains). */
283
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000285 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000286 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 PyErr_NoMemory();
290 return -1;
291 }
292 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294
Benjamin Peterson29060642009-01-31 22:14:21 +0000295 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000298 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
305/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000306 Ux0000 terminated; some code (e.g. new_identifier)
307 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308
309 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000310 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000417 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 }
419 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000420 *(PyUnicodeObject **)unicode = free_list;
421 free_list = unicode;
422 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000425 PyObject_DEL(unicode->str);
426 Py_XDECREF(unicode->defenc);
427 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 }
429}
430
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433{
434 register PyUnicodeObject *v;
435
436 /* Argument checks */
437 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000441 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 PyErr_BadInternalCall();
444 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 }
446
447 /* Resizing unicode_empty and single character objects is not
448 possible since these are being shared. We simply return a fresh
449 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000450 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 (v == unicode_empty || v->length == 1)) {
452 PyUnicodeObject *w = _PyUnicode_New(length);
453 if (w == NULL)
454 return -1;
455 Py_UNICODE_COPY(w->str, v->str,
456 length < v->length ? length : v->length);
457 Py_DECREF(*unicode);
458 *unicode = w;
459 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
461
462 /* Note that we don't have to modify *unicode for unshared Unicode
463 objects, since we can modify them in-place. */
464 return unicode_resize(v, length);
465}
466
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474{
475 PyUnicodeObject *unicode;
476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 /* If the Unicode data is known at construction time, we can apply
478 some optimizations which share commonly used objects. */
479 if (u != NULL) {
480
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000486
487 /* Single character Unicode objects in the Latin-1 range are
488 shared when using this constructor */
489 if (size == 1 && *u < 256) {
490 unicode = unicode_latin1[*u];
491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = *u;
496 unicode_latin1[*u] = unicode;
497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000501 }
Tim Petersced69f82003-09-16 20:30:58 +0000502
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 /* Copy the Unicode data into the new object */
508 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000509 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 return (PyObject *)unicode;
512}
513
Walter Dörwaldd2034312007-05-18 16:29:38 +0000514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000515{
516 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000517
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 return NULL;
522 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000525 some optimizations which share commonly used objects.
526 Also, this means the input must be UTF-8, so fall back to the
527 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (u != NULL) {
529
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 /* Optimization for empty strings */
531 if (size == 0 && unicode_empty != NULL) {
532 Py_INCREF(unicode_empty);
533 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000534 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000535
536 /* Single characters are shared when using this constructor.
537 Restrict to ASCII, since the input must be UTF-8. */
538 if (size == 1 && Py_CHARMASK(*u) < 128) {
539 unicode = unicode_latin1[Py_CHARMASK(*u)];
540 if (!unicode) {
541 unicode = _PyUnicode_New(1);
542 if (!unicode)
543 return NULL;
544 unicode->str[0] = Py_CHARMASK(*u);
545 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 }
547 Py_INCREF(unicode);
548 return (PyObject *)unicode;
549 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000550
551 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000552 }
553
Walter Dörwald55507312007-05-18 13:12:10 +0000554 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 if (!unicode)
556 return NULL;
557
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 return (PyObject *)unicode;
559}
560
Walter Dörwaldd2034312007-05-18 16:29:38 +0000561PyObject *PyUnicode_FromString(const char *u)
562{
563 size_t size = strlen(u);
564 if (size > PY_SSIZE_T_MAX) {
565 PyErr_SetString(PyExc_OverflowError, "input too long");
566 return NULL;
567 }
568
569 return PyUnicode_FromStringAndSize(u, size);
570}
571
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572#ifdef HAVE_WCHAR_H
573
Mark Dickinson081dfee2009-03-18 14:47:41 +0000574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581 to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584 Py_ssize_t size)
585{
586 PyUnicodeObject *unicode;
587 register Py_ssize_t i;
588 Py_ssize_t alloc;
589 const wchar_t *orig_w;
590
591 if (w == NULL) {
592 if (size == 0)
593 return PyUnicode_FromStringAndSize(NULL, 0);
594 PyErr_BadInternalCall();
595 return NULL;
596 }
597
598 if (size == -1) {
599 size = wcslen(w);
600 }
601
602 alloc = size;
603 orig_w = w;
604 for (i = size; i > 0; i--) {
605 if (*w > 0xFFFF)
606 alloc++;
607 w++;
608 }
609 w = orig_w;
610 unicode = _PyUnicode_New(alloc);
611 if (!unicode)
612 return NULL;
613
614 /* Copy the wchar_t data into the new object */
615 {
616 register Py_UNICODE *u;
617 u = PyUnicode_AS_UNICODE(unicode);
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF) {
620 wchar_t ordinal = *w++;
621 ordinal -= 0x10000;
622 *u++ = 0xD800 | (ordinal >> 10);
623 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 }
625 else
626 *u++ = *w++;
627 }
628 }
629 return (PyObject *)unicode;
630}
631
632#else
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636{
637 PyUnicodeObject *unicode;
638
639 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000640 if (size == 0)
641 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == -1) {
647 size = wcslen(w);
648 }
649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = _PyUnicode_New(size);
651 if (!unicode)
652 return NULL;
653
654 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000657#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000659 register Py_UNICODE *u;
660 register Py_ssize_t i;
661 u = PyUnicode_AS_UNICODE(unicode);
662 for (i = size; i > 0; i--)
663 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665#endif
666
667 return (PyObject *)unicode;
668}
669
Mark Dickinson081dfee2009-03-18 14:47:41 +0000670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
Walter Dörwald346737f2007-05-31 10:44:43 +0000674static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000688 else if (longlongflag) {
689 /* longlongflag should only ever be nonzero on machines with
690 HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692 char *f = PY_FORMAT_LONG_LONG;
693 while (*f)
694 *fmt++ = *f++;
695#else
696 /* we shouldn't ever get here */
697 assert(0);
698 *fmt++ = 'l';
699#endif
700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000701 else if (size_tflag) {
702 char *f = PY_FORMAT_SIZE_T;
703 while (*f)
704 *fmt++ = *f++;
705 }
706 *fmt++ = c;
707 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000708}
709
Walter Dörwaldd2034312007-05-18 16:29:38 +0000710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
711
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000712/* size of fixed-size buffer for formatting single arguments */
713#define ITEM_BUFFER_LEN 21
714/* maximum number of characters required for output of %ld. 21 characters
715 allows for 64-bit integers (in decimal) and an optional sign. */
716#define MAX_LONG_CHARS 21
717/* maximum number of characters required for output of %lld.
718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 va_list count;
726 Py_ssize_t callcount = 0;
727 PyObject **callresults = NULL;
728 PyObject **callresult = NULL;
729 Py_ssize_t n = 0;
730 int width = 0;
731 int precision = 0;
732 int zeropad;
733 const char* f;
734 Py_UNICODE *s;
735 PyObject *string;
736 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000737 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 /* use abuffer instead of buffer, if we need more space
739 * (which can happen if there's a format specifier with width). */
740 char *abuffer = NULL;
741 char *realbuffer;
742 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000746 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000747 /* step 1: count the number of %S/%R/%A/%s format specifications
748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000752 if (*f == '%') {
753 if (*(f+1)=='%')
754 continue;
Victor Stinner2b574a22011-03-01 22:48:49 +0000755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000756 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000757 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000758 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 ;
761 if (*f == 's')
762 ++callcount;
763 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000764 else if (128 <= (unsigned char)*f) {
765 PyErr_Format(PyExc_ValueError,
766 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000767 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000768 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000769 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 }
772 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 if (callcount) {
775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776 if (!callresults) {
777 PyErr_NoMemory();
778 return NULL;
779 }
780 callresult = callresults;
781 }
782 /* step 3: figure out how large a buffer we need */
783 for (f = format; *f; f++) {
784 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000785#ifdef HAVE_LONG_LONG
786 int longlongflag = 0;
787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000788 const char* p = f;
789 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000790 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796 * they don't affect the amount of space we reserve.
797 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000798 if (*f == 'l') {
799 if (f[1] == 'd' || f[1] == 'u') {
800 ++f;
801 }
802#ifdef HAVE_LONG_LONG
803 else if (f[1] == 'l' &&
804 (f[2] == 'd' || f[2] == 'u')) {
805 longlongflag = 1;
806 f += 2;
807 }
808#endif
809 }
810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000811 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000812 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
Benjamin Peterson14339b62009-01-31 16:36:08 +0000814 switch (*f) {
815 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +0000816 {
817#ifndef Py_UNICODE_WIDE
818 int ordinal = va_arg(count, int);
819 if (ordinal > 0xffff)
820 n += 2;
821 else
822 n++;
823#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000824 (void)va_arg(count, int);
Victor Stinner659eb842011-02-23 12:14:22 +0000825 n++;
826#endif
827 break;
828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 case '%':
830 n++;
831 break;
832 case 'd': case 'u': case 'i': case 'x':
833 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000834#ifdef HAVE_LONG_LONG
835 if (longlongflag) {
836 if (width < MAX_LONG_LONG_CHARS)
837 width = MAX_LONG_LONG_CHARS;
838 }
839 else
840#endif
841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842 including sign. Decimal takes the most space. This
843 isn't enough for octal. If a width is specified we
844 need more (which we allocate later). */
845 if (width < MAX_LONG_CHARS)
846 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000848 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000849 if (abuffersize < width)
850 abuffersize = width;
851 break;
852 case 's':
853 {
854 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000855 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857 if (!str)
858 goto fail;
859 n += PyUnicode_GET_SIZE(str);
860 /* Remember the str and switch to the next slot */
861 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000862 break;
863 }
864 case 'U':
865 {
866 PyObject *obj = va_arg(count, PyObject *);
867 assert(obj && PyUnicode_Check(obj));
868 n += PyUnicode_GET_SIZE(obj);
869 break;
870 }
871 case 'V':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 const char *str = va_arg(count, const char *);
Victor Stinner2b574a22011-03-01 22:48:49 +0000875 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000876 assert(obj || str);
877 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2b574a22011-03-01 22:48:49 +0000878 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000879 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2b574a22011-03-01 22:48:49 +0000880 *callresult++ = NULL;
881 }
882 else {
883 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
884 if (!str_obj)
885 goto fail;
886 n += PyUnicode_GET_SIZE(str_obj);
887 *callresult++ = str_obj;
888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000889 break;
890 }
891 case 'S':
892 {
893 PyObject *obj = va_arg(count, PyObject *);
894 PyObject *str;
895 assert(obj);
896 str = PyObject_Str(obj);
897 if (!str)
898 goto fail;
899 n += PyUnicode_GET_SIZE(str);
900 /* Remember the str and switch to the next slot */
901 *callresult++ = str;
902 break;
903 }
904 case 'R':
905 {
906 PyObject *obj = va_arg(count, PyObject *);
907 PyObject *repr;
908 assert(obj);
909 repr = PyObject_Repr(obj);
910 if (!repr)
911 goto fail;
912 n += PyUnicode_GET_SIZE(repr);
913 /* Remember the repr and switch to the next slot */
914 *callresult++ = repr;
915 break;
916 }
917 case 'A':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 PyObject *ascii;
921 assert(obj);
922 ascii = PyObject_ASCII(obj);
923 if (!ascii)
924 goto fail;
925 n += PyUnicode_GET_SIZE(ascii);
926 /* Remember the repr and switch to the next slot */
927 *callresult++ = ascii;
928 break;
929 }
930 case 'p':
931 (void) va_arg(count, int);
932 /* maximum 64-bit pointer representation:
933 * 0xffffffffffffffff
934 * so 19 characters is enough.
935 * XXX I count 18 -- what's the extra for?
936 */
937 n += 19;
938 break;
939 default:
940 /* if we stumble upon an unknown
941 formatting code, copy the rest of
942 the format string to the output
943 string. (we cannot just skip the
944 code, since there's no way to know
945 what's in the argument list) */
946 n += strlen(p);
947 goto expand;
948 }
949 } else
950 n++;
951 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000952 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000953 if (abuffersize > ITEM_BUFFER_LEN) {
954 /* add 1 for sprintf's trailing null byte */
955 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 if (!abuffer) {
957 PyErr_NoMemory();
958 goto fail;
959 }
960 realbuffer = abuffer;
961 }
962 else
963 realbuffer = buffer;
964 /* step 4: fill the buffer */
965 /* Since we've analyzed how much space we need for the worst case,
966 we don't have to resize the string.
967 There can be no errors beyond this point. */
968 string = PyUnicode_FromUnicode(NULL, n);
969 if (!string)
970 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000971
Benjamin Peterson14339b62009-01-31 16:36:08 +0000972 s = PyUnicode_AS_UNICODE(string);
973 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000974
Benjamin Peterson14339b62009-01-31 16:36:08 +0000975 for (f = format; *f; f++) {
976 if (*f == '%') {
977 const char* p = f++;
978 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000979 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000980 int size_tflag = 0;
981 zeropad = (*f == '0');
982 /* parse the width.precision part */
983 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000984 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000985 width = (width*10) + *f++ - '0';
986 precision = 0;
987 if (*f == '.') {
988 f++;
David Malcolm96960882010-11-05 17:23:41 +0000989 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000990 precision = (precision*10) + *f++ - '0';
991 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000992 /* Handle %ld, %lu, %lld and %llu. */
993 if (*f == 'l') {
994 if (f[1] == 'd' || f[1] == 'u') {
995 longflag = 1;
996 ++f;
997 }
998#ifdef HAVE_LONG_LONG
999 else if (f[1] == 'l' &&
1000 (f[2] == 'd' || f[2] == 'u')) {
1001 longlongflag = 1;
1002 f += 2;
1003 }
1004#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001005 }
1006 /* handle the size_t flag. */
1007 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1008 size_tflag = 1;
1009 ++f;
1010 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001011
Benjamin Peterson14339b62009-01-31 16:36:08 +00001012 switch (*f) {
1013 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +00001014 {
1015 int ordinal = va_arg(vargs, int);
1016#ifndef Py_UNICODE_WIDE
1017 if (ordinal > 0xffff) {
1018 ordinal -= 0x10000;
1019 *s++ = 0xD800 | (ordinal >> 10);
1020 *s++ = 0xDC00 | (ordinal & 0x3FF);
1021 } else
1022#endif
1023 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001024 break;
Victor Stinner659eb842011-02-23 12:14:22 +00001025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001026 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001027 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1028 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001029 if (longflag)
1030 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001031#ifdef HAVE_LONG_LONG
1032 else if (longlongflag)
1033 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1034#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 else if (size_tflag)
1036 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1037 else
1038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001042 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1043 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001044 if (longflag)
1045 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001046#ifdef HAVE_LONG_LONG
1047 else if (longlongflag)
1048 sprintf(realbuffer, fmt, va_arg(vargs,
1049 unsigned PY_LONG_LONG));
1050#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 else if (size_tflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1053 else
1054 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1055 appendstring(realbuffer);
1056 break;
1057 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001058 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001059 sprintf(realbuffer, fmt, va_arg(vargs, int));
1060 appendstring(realbuffer);
1061 break;
1062 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001063 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 sprintf(realbuffer, fmt, va_arg(vargs, int));
1065 appendstring(realbuffer);
1066 break;
1067 case 's':
1068 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001069 /* unused, since we already have the result */
1070 (void) va_arg(vargs, char *);
1071 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1072 PyUnicode_GET_SIZE(*callresult));
1073 s += PyUnicode_GET_SIZE(*callresult);
1074 /* We're done with the unicode()/repr() => forget it */
1075 Py_DECREF(*callresult);
1076 /* switch to next unicode()/repr() result */
1077 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 break;
1079 }
1080 case 'U':
1081 {
1082 PyObject *obj = va_arg(vargs, PyObject *);
1083 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1084 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1085 s += size;
1086 break;
1087 }
1088 case 'V':
1089 {
1090 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2b574a22011-03-01 22:48:49 +00001091 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001092 if (obj) {
1093 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1094 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1095 s += size;
1096 } else {
Victor Stinner2b574a22011-03-01 22:48:49 +00001097 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1098 PyUnicode_GET_SIZE(*callresult));
1099 s += PyUnicode_GET_SIZE(*callresult);
1100 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001101 }
Victor Stinner2b574a22011-03-01 22:48:49 +00001102 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001103 break;
1104 }
1105 case 'S':
1106 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001107 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001108 {
1109 Py_UNICODE *ucopy;
1110 Py_ssize_t usize;
1111 Py_ssize_t upos;
1112 /* unused, since we already have the result */
1113 (void) va_arg(vargs, PyObject *);
1114 ucopy = PyUnicode_AS_UNICODE(*callresult);
1115 usize = PyUnicode_GET_SIZE(*callresult);
1116 for (upos = 0; upos<usize;)
1117 *s++ = ucopy[upos++];
1118 /* We're done with the unicode()/repr() => forget it */
1119 Py_DECREF(*callresult);
1120 /* switch to next unicode()/repr() result */
1121 ++callresult;
1122 break;
1123 }
1124 case 'p':
1125 sprintf(buffer, "%p", va_arg(vargs, void*));
1126 /* %p is ill-defined: ensure leading 0x. */
1127 if (buffer[1] == 'X')
1128 buffer[1] = 'x';
1129 else if (buffer[1] != 'x') {
1130 memmove(buffer+2, buffer, strlen(buffer)+1);
1131 buffer[0] = '0';
1132 buffer[1] = 'x';
1133 }
1134 appendstring(buffer);
1135 break;
1136 case '%':
1137 *s++ = '%';
1138 break;
1139 default:
1140 appendstring(p);
1141 goto end;
1142 }
Victor Stinner1205f272010-09-11 00:54:47 +00001143 }
Victor Stinner1205f272010-09-11 00:54:47 +00001144 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001145 *s++ = *f;
1146 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147
Benjamin Peterson29060642009-01-31 22:14:21 +00001148 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 if (callresults)
1150 PyObject_Free(callresults);
1151 if (abuffer)
1152 PyObject_Free(abuffer);
1153 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1154 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001155 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001156 if (callresults) {
1157 PyObject **callresult2 = callresults;
1158 while (callresult2 < callresult) {
Victor Stinner2b574a22011-03-01 22:48:49 +00001159 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 ++callresult2;
1161 }
1162 PyObject_Free(callresults);
1163 }
1164 if (abuffer)
1165 PyObject_Free(abuffer);
1166 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001167}
1168
1169#undef appendstring
1170
1171PyObject *
1172PyUnicode_FromFormat(const char *format, ...)
1173{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 PyObject* ret;
1175 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001176
1177#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001178 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001179#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001180 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001181#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001182 ret = PyUnicode_FromFormatV(format, vargs);
1183 va_end(vargs);
1184 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001185}
1186
Victor Stinner5593d8a2010-10-02 11:11:27 +00001187/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1188 convert a Unicode object to a wide character string.
1189
Victor Stinnerd88d9832011-09-06 02:00:05 +02001190 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001191 character) required to convert the unicode object. Ignore size argument.
1192
Victor Stinnerd88d9832011-09-06 02:00:05 +02001193 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001194 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001195 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001196static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001197unicode_aswidechar(PyUnicodeObject *unicode,
1198 wchar_t *w,
1199 Py_ssize_t size)
1200{
1201#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001202 Py_ssize_t res;
1203 if (w != NULL) {
1204 res = PyUnicode_GET_SIZE(unicode);
1205 if (size > res)
1206 size = res + 1;
1207 else
1208 res = size;
1209 memcpy(w, unicode->str, size * sizeof(wchar_t));
1210 return res;
1211 }
1212 else
1213 return PyUnicode_GET_SIZE(unicode) + 1;
1214#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1215 register const Py_UNICODE *u;
1216 const Py_UNICODE *uend;
1217 const wchar_t *worig, *wend;
1218 Py_ssize_t nchar;
1219
Victor Stinner137c34c2010-09-29 10:25:54 +00001220 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001221 uend = u + PyUnicode_GET_SIZE(unicode);
1222 if (w != NULL) {
1223 worig = w;
1224 wend = w + size;
1225 while (u != uend && w != wend) {
1226 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1227 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1228 {
1229 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1230 u += 2;
1231 }
1232 else {
1233 *w = *u;
1234 u++;
1235 }
1236 w++;
1237 }
1238 if (w != wend)
1239 *w = L'\0';
1240 return w - worig;
1241 }
1242 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001243 nchar = 1; /* null character at the end */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001244 while (u != uend) {
1245 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1246 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1247 u += 2;
1248 else
1249 u++;
1250 nchar++;
1251 }
1252 }
1253 return nchar;
1254#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1255 register Py_UNICODE *u, *uend, ordinal;
1256 register Py_ssize_t i;
1257 wchar_t *worig, *wend;
1258 Py_ssize_t nchar;
1259
1260 u = PyUnicode_AS_UNICODE(unicode);
1261 uend = u + PyUnicode_GET_SIZE(u);
1262 if (w != NULL) {
1263 worig = w;
1264 wend = w + size;
1265 while (u != uend && w != wend) {
1266 ordinal = *u;
1267 if (ordinal > 0xffff) {
1268 ordinal -= 0x10000;
1269 *w++ = 0xD800 | (ordinal >> 10);
1270 *w++ = 0xDC00 | (ordinal & 0x3FF);
1271 }
1272 else
1273 *w++ = ordinal;
1274 u++;
1275 }
1276 if (w != wend)
1277 *w = 0;
1278 return w - worig;
1279 }
1280 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001281 nchar = 1; /* null character */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001282 while (u != uend) {
1283 if (*u > 0xffff)
1284 nchar += 2;
1285 else
1286 nchar++;
1287 u++;
1288 }
1289 return nchar;
1290 }
1291#else
1292# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001293#endif
1294}
1295
1296Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001297PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001298 wchar_t *w,
1299 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001302 PyErr_BadInternalCall();
1303 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001305 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306}
1307
Victor Stinner137c34c2010-09-29 10:25:54 +00001308wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001309PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001310 Py_ssize_t *size)
1311{
1312 wchar_t* buffer;
1313 Py_ssize_t buflen;
1314
1315 if (unicode == NULL) {
1316 PyErr_BadInternalCall();
1317 return NULL;
1318 }
1319
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001320 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001321 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001322 PyErr_NoMemory();
1323 return NULL;
1324 }
1325
Victor Stinner137c34c2010-09-29 10:25:54 +00001326 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1327 if (buffer == NULL) {
1328 PyErr_NoMemory();
1329 return NULL;
1330 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001331 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001332 if (size != NULL)
1333 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001334 return buffer;
1335}
1336
Guido van Rossumd57fd912000-03-10 22:53:23 +00001337#endif
1338
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001339PyObject *PyUnicode_FromOrdinal(int ordinal)
1340{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001341 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001342
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001343 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001344 PyErr_SetString(PyExc_ValueError,
1345 "chr() arg not in range(0x110000)");
1346 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001347 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001348
1349#ifndef Py_UNICODE_WIDE
1350 if (ordinal > 0xffff) {
1351 ordinal -= 0x10000;
1352 s[0] = 0xD800 | (ordinal >> 10);
1353 s[1] = 0xDC00 | (ordinal & 0x3FF);
1354 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001355 }
1356#endif
1357
Hye-Shik Chang40574832004-04-06 07:24:51 +00001358 s[0] = (Py_UNICODE)ordinal;
1359 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360}
1361
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362PyObject *PyUnicode_FromObject(register PyObject *obj)
1363{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001364 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001366 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001367 Py_INCREF(obj);
1368 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001369 }
1370 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 /* For a Unicode subtype that's not a Unicode object,
1372 return a true Unicode object with the same data. */
1373 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1374 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001375 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001376 PyErr_Format(PyExc_TypeError,
1377 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001378 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001379 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001380}
1381
1382PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001383 const char *encoding,
1384 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001385{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001386 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001387 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001388
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 PyErr_BadInternalCall();
1391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001393
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001394 /* Decoding bytes objects is the most common case and should be fast */
1395 if (PyBytes_Check(obj)) {
1396 if (PyBytes_GET_SIZE(obj) == 0) {
1397 Py_INCREF(unicode_empty);
1398 v = (PyObject *) unicode_empty;
1399 }
1400 else {
1401 v = PyUnicode_Decode(
1402 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1403 encoding, errors);
1404 }
1405 return v;
1406 }
1407
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001408 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001409 PyErr_SetString(PyExc_TypeError,
1410 "decoding str is not supported");
1411 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001412 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1415 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1416 PyErr_Format(PyExc_TypeError,
1417 "coercing to str: need bytes, bytearray "
1418 "or buffer-like object, %.80s found",
1419 Py_TYPE(obj)->tp_name);
1420 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001421 }
Tim Petersced69f82003-09-16 20:30:58 +00001422
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001423 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001424 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001425 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 }
Tim Petersced69f82003-09-16 20:30:58 +00001427 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001428 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001429
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001430 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001431 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432}
1433
/* Copy `encoding` into `lower`, converting upper case letters to lower case
   and replacing '_' with '-', in order to catch e.g. UTF_8.

   `lower` is a buffer of `lower_len` bytes.  Return 1 on success, in which
   case `lower` is null-terminated.  Return 0 if the encoding name is longer
   than lower_len-1 characters; `lower` is then left without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last position of the buffer, reserved for the null byte */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1466
1467PyObject *PyUnicode_Decode(const char *s,
1468 Py_ssize_t size,
1469 const char *encoding,
1470 const char *errors)
1471{
1472 PyObject *buffer = NULL, *unicode;
1473 Py_buffer info;
1474 char lower[11]; /* Enough for any encoding shortcut */
1475
1476 if (encoding == NULL)
1477 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001478
1479 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001480 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1481 if (strcmp(lower, "utf-8") == 0)
1482 return PyUnicode_DecodeUTF8(s, size, errors);
1483 else if ((strcmp(lower, "latin-1") == 0) ||
1484 (strcmp(lower, "iso-8859-1") == 0))
1485 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001486#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001487 else if (strcmp(lower, "mbcs") == 0)
1488 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001489#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001490 else if (strcmp(lower, "ascii") == 0)
1491 return PyUnicode_DecodeASCII(s, size, errors);
1492 else if (strcmp(lower, "utf-16") == 0)
1493 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1494 else if (strcmp(lower, "utf-32") == 0)
1495 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497
1498 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001499 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001500 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001501 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001502 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 if (buffer == NULL)
1504 goto onError;
1505 unicode = PyCodec_Decode(buffer, encoding, errors);
1506 if (unicode == NULL)
1507 goto onError;
1508 if (!PyUnicode_Check(unicode)) {
1509 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001510 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001511 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 Py_DECREF(unicode);
1513 goto onError;
1514 }
1515 Py_DECREF(buffer);
1516 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001517
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 Py_XDECREF(buffer);
1520 return NULL;
1521}
1522
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001523PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1524 const char *encoding,
1525 const char *errors)
1526{
1527 PyObject *v;
1528
1529 if (!PyUnicode_Check(unicode)) {
1530 PyErr_BadArgument();
1531 goto onError;
1532 }
1533
1534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001536
1537 /* Decode via the codec registry */
1538 v = PyCodec_Decode(unicode, encoding, errors);
1539 if (v == NULL)
1540 goto onError;
1541 return v;
1542
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001544 return NULL;
1545}
1546
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001547PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
1550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 if (!PyUnicode_Check(v)) {
1566 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001567 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001568 Py_TYPE(v)->tp_name);
1569 Py_DECREF(v);
1570 goto onError;
1571 }
1572 return v;
1573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575 return NULL;
1576}
1577
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001579 Py_ssize_t size,
1580 const char *encoding,
1581 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582{
1583 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001584
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585 unicode = PyUnicode_FromUnicode(s, size);
1586 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1589 Py_DECREF(unicode);
1590 return v;
1591}
1592
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001593PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1594 const char *encoding,
1595 const char *errors)
1596{
1597 PyObject *v;
1598
1599 if (!PyUnicode_Check(unicode)) {
1600 PyErr_BadArgument();
1601 goto onError;
1602 }
1603
1604 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001606
1607 /* Encode via the codec registry */
1608 v = PyCodec_Encode(unicode, encoding, errors);
1609 if (v == NULL)
1610 goto onError;
1611 return v;
1612
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001614 return NULL;
1615}
1616
Victor Stinnerad158722010-10-27 00:25:46 +00001617PyObject *
1618PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001619{
Victor Stinner313a1202010-06-11 23:56:51 +00001620#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001621 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1622 PyUnicode_GET_SIZE(unicode),
1623 NULL);
1624#elif defined(__APPLE__)
1625 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1626 PyUnicode_GET_SIZE(unicode),
1627 "surrogateescape");
1628#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001629 PyInterpreterState *interp = PyThreadState_GET()->interp;
1630 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1631 cannot use it to encode and decode filenames before it is loaded. Load
1632 the Python codec requires to encode at least its own filename. Use the C
1633 version of the locale codec until the codec registry is initialized and
1634 the Python codec is loaded.
1635
1636 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1637 cannot only rely on it: check also interp->fscodec_initialized for
1638 subinterpreters. */
1639 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001640 return PyUnicode_AsEncodedString(unicode,
1641 Py_FileSystemDefaultEncoding,
1642 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001643 }
1644 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001645 /* locale encoding with surrogateescape */
1646 wchar_t *wchar;
1647 char *bytes;
1648 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001649 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001650
1651 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1652 if (wchar == NULL)
1653 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001654 bytes = _Py_wchar2char(wchar, &error_pos);
1655 if (bytes == NULL) {
1656 if (error_pos != (size_t)-1) {
1657 char *errmsg = strerror(errno);
1658 PyObject *exc = NULL;
1659 if (errmsg == NULL)
1660 errmsg = "Py_wchar2char() failed";
1661 raise_encode_exception(&exc,
1662 "filesystemencoding",
1663 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1664 error_pos, error_pos+1,
1665 errmsg);
1666 Py_XDECREF(exc);
1667 }
1668 else
1669 PyErr_NoMemory();
1670 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001671 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001672 }
1673 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001674
1675 bytes_obj = PyBytes_FromString(bytes);
1676 PyMem_Free(bytes);
1677 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001678 }
Victor Stinnerad158722010-10-27 00:25:46 +00001679#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001680}
1681
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1683 const char *encoding,
1684 const char *errors)
1685{
1686 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001687 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001688
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689 if (!PyUnicode_Check(unicode)) {
1690 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 }
Fred Drakee4315f52000-05-09 19:53:39 +00001693
Tim Petersced69f82003-09-16 20:30:58 +00001694 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001695 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001696
1697 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001698 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1699 if (strcmp(lower, "utf-8") == 0)
1700 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1701 PyUnicode_GET_SIZE(unicode),
1702 errors);
1703 else if ((strcmp(lower, "latin-1") == 0) ||
1704 (strcmp(lower, "iso-8859-1") == 0))
1705 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1706 PyUnicode_GET_SIZE(unicode),
1707 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001708#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001709 else if (strcmp(lower, "mbcs") == 0)
1710 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1711 PyUnicode_GET_SIZE(unicode),
1712 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001713#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001714 else if (strcmp(lower, "ascii") == 0)
1715 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1716 PyUnicode_GET_SIZE(unicode),
1717 errors);
1718 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001719 /* During bootstrap, we may need to find the encodings
1720 package, to load the file system encoding, and require the
1721 file system encoding in order to load the encodings
1722 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001723
Victor Stinner59e62db2010-05-15 13:14:32 +00001724 Break out of this dependency by assuming that the path to
1725 the encodings module is ASCII-only. XXX could try wcstombs
1726 instead, if the file system encoding is the locale's
1727 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001728 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001729 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1730 !PyThreadState_GET()->interp->codecs_initialized)
1731 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1732 PyUnicode_GET_SIZE(unicode),
1733 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734
1735 /* Encode via the codec registry */
1736 v = PyCodec_Encode(unicode, encoding, errors);
1737 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001738 return NULL;
1739
1740 /* The normal path */
1741 if (PyBytes_Check(v))
1742 return v;
1743
1744 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001745 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001746 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001747 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001748
1749 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1750 "encoder %s returned bytearray instead of bytes",
1751 encoding);
1752 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001753 Py_DECREF(v);
1754 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001755 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001756
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001757 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1758 Py_DECREF(v);
1759 return b;
1760 }
1761
1762 PyErr_Format(PyExc_TypeError,
1763 "encoder did not return a bytes object (type=%.400s)",
1764 Py_TYPE(v)->tp_name);
1765 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001766 return NULL;
1767}
1768
1769PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1770 const char *encoding,
1771 const char *errors)
1772{
1773 PyObject *v;
1774
1775 if (!PyUnicode_Check(unicode)) {
1776 PyErr_BadArgument();
1777 goto onError;
1778 }
1779
1780 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001781 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001782
1783 /* Encode via the codec registry */
1784 v = PyCodec_Encode(unicode, encoding, errors);
1785 if (v == NULL)
1786 goto onError;
1787 if (!PyUnicode_Check(v)) {
1788 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001789 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001790 Py_TYPE(v)->tp_name);
1791 Py_DECREF(v);
1792 goto onError;
1793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001795
Benjamin Peterson29060642009-01-31 22:14:21 +00001796 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 return NULL;
1798}
1799
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001800PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001801 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001802{
1803 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001804 if (v)
1805 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001806 if (errors != NULL)
1807 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001808 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001809 PyUnicode_GET_SIZE(unicode),
1810 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001811 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001812 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001813 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001814 return v;
1815}
1816
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001817PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001818PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001819 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001820 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1821}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001822
Christian Heimes5894ba72007-11-04 11:43:14 +00001823PyObject*
1824PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1825{
Victor Stinnerad158722010-10-27 00:25:46 +00001826#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1827 return PyUnicode_DecodeMBCS(s, size, NULL);
1828#elif defined(__APPLE__)
1829 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1830#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001831 PyInterpreterState *interp = PyThreadState_GET()->interp;
1832 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1833 cannot use it to encode and decode filenames before it is loaded. Load
1834 the Python codec requires to encode at least its own filename. Use the C
1835 version of the locale codec until the codec registry is initialized and
1836 the Python codec is loaded.
1837
1838 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1839 cannot only rely on it: check also interp->fscodec_initialized for
1840 subinterpreters. */
1841 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001842 return PyUnicode_Decode(s, size,
1843 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001844 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001845 }
1846 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001847 /* locale encoding with surrogateescape */
1848 wchar_t *wchar;
1849 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001850 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001851
1852 if (s[size] != '\0' || size != strlen(s)) {
1853 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1854 return NULL;
1855 }
1856
Victor Stinner168e1172010-10-16 23:16:16 +00001857 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001858 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001859 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001860
Victor Stinner168e1172010-10-16 23:16:16 +00001861 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001862 PyMem_Free(wchar);
1863 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001864 }
Victor Stinnerad158722010-10-27 00:25:46 +00001865#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001866}
1867
Martin v. Löwis011e8422009-05-05 04:43:17 +00001868
1869int
1870PyUnicode_FSConverter(PyObject* arg, void* addr)
1871{
1872 PyObject *output = NULL;
1873 Py_ssize_t size;
1874 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001875 if (arg == NULL) {
1876 Py_DECREF(*(PyObject**)addr);
1877 return 1;
1878 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001879 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001880 output = arg;
1881 Py_INCREF(output);
1882 }
1883 else {
1884 arg = PyUnicode_FromObject(arg);
1885 if (!arg)
1886 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001887 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001888 Py_DECREF(arg);
1889 if (!output)
1890 return 0;
1891 if (!PyBytes_Check(output)) {
1892 Py_DECREF(output);
1893 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1894 return 0;
1895 }
1896 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001897 size = PyBytes_GET_SIZE(output);
1898 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001899 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05001900 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001901 Py_DECREF(output);
1902 return 0;
1903 }
1904 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001905 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001906}
1907
1908
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001909int
1910PyUnicode_FSDecoder(PyObject* arg, void* addr)
1911{
1912 PyObject *output = NULL;
1913 Py_ssize_t size;
1914 void *data;
1915 if (arg == NULL) {
1916 Py_DECREF(*(PyObject**)addr);
1917 return 1;
1918 }
1919 if (PyUnicode_Check(arg)) {
1920 output = arg;
1921 Py_INCREF(output);
1922 }
1923 else {
1924 arg = PyBytes_FromObject(arg);
1925 if (!arg)
1926 return 0;
1927 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1928 PyBytes_GET_SIZE(arg));
1929 Py_DECREF(arg);
1930 if (!output)
1931 return 0;
1932 if (!PyUnicode_Check(output)) {
1933 Py_DECREF(output);
1934 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1935 return 0;
1936 }
1937 }
1938 size = PyUnicode_GET_SIZE(output);
1939 data = PyUnicode_AS_UNICODE(output);
1940 if (size != Py_UNICODE_strlen(data)) {
1941 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1942 Py_DECREF(output);
1943 return 0;
1944 }
1945 *(PyObject**)addr = output;
1946 return Py_CLEANUP_SUPPORTED;
1947}
1948
1949
Martin v. Löwis5b222132007-06-10 09:51:05 +00001950char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001951_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001952{
Christian Heimesf3863112007-11-22 07:46:41 +00001953 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001954 if (!PyUnicode_Check(unicode)) {
1955 PyErr_BadArgument();
1956 return NULL;
1957 }
Christian Heimesf3863112007-11-22 07:46:41 +00001958 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1959 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001960 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001961 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001962 *psize = PyBytes_GET_SIZE(bytes);
1963 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001964}
1965
1966char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001967_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001968{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001969 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001970}
1971
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1973{
1974 if (!PyUnicode_Check(unicode)) {
1975 PyErr_BadArgument();
1976 goto onError;
1977 }
1978 return PyUnicode_AS_UNICODE(unicode);
1979
Benjamin Peterson29060642009-01-31 22:14:21 +00001980 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 return NULL;
1982}
1983
Martin v. Löwis18e16552006-02-15 17:27:45 +00001984Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985{
1986 if (!PyUnicode_Check(unicode)) {
1987 PyErr_BadArgument();
1988 goto onError;
1989 }
1990 return PyUnicode_GET_SIZE(unicode);
1991
Benjamin Peterson29060642009-01-31 22:14:21 +00001992 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993 return -1;
1994}
1995
/* Name of the default encoding.  It is a fixed string literal, so the
   caller must not modify or free it. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    const char *default_codec = "utf-8";

    return default_codec;
}
2000
Victor Stinner554f3f02010-06-16 23:33:54 +00002001/* create or adjust a UnicodeDecodeError */
2002static void
2003make_decode_exception(PyObject **exceptionObject,
2004 const char *encoding,
2005 const char *input, Py_ssize_t length,
2006 Py_ssize_t startpos, Py_ssize_t endpos,
2007 const char *reason)
2008{
2009 if (*exceptionObject == NULL) {
2010 *exceptionObject = PyUnicodeDecodeError_Create(
2011 encoding, input, length, startpos, endpos, reason);
2012 }
2013 else {
2014 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2015 goto onError;
2016 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2017 goto onError;
2018 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2019 goto onError;
2020 }
2021 return;
2022
2023onError:
2024 Py_DECREF(*exceptionObject);
2025 *exceptionObject = NULL;
2026}
2027
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028/* error handling callback helper:
2029 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002030 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002031 and adjust various state variables.
2032 return 0 on success, -1 on error
2033*/
2034
2035static
2036int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00002037 const char *encoding, const char *reason,
2038 const char **input, const char **inend, Py_ssize_t *startinpos,
2039 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2040 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002042 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002043
2044 PyObject *restuple = NULL;
2045 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002046 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002047 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002048 Py_ssize_t requiredsize;
2049 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002051 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002052 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 int res = -1;
2054
2055 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002056 *errorHandler = PyCodec_LookupError(errors);
2057 if (*errorHandler == NULL)
2058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002059 }
2060
Victor Stinner554f3f02010-06-16 23:33:54 +00002061 make_decode_exception(exceptionObject,
2062 encoding,
2063 *input, *inend - *input,
2064 *startinpos, *endinpos,
2065 reason);
2066 if (*exceptionObject == NULL)
2067 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002068
2069 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2070 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002073 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002074 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075 }
2076 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002077 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002078
2079 /* Copy back the bytes variables, which might have been modified by the
2080 callback */
2081 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2082 if (!inputobj)
2083 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002084 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002085 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002086 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002087 *input = PyBytes_AS_STRING(inputobj);
2088 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002089 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002090 /* we can DECREF safely, as the exception has another reference,
2091 so the object won't go away. */
2092 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002093
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002095 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002096 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002097 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2098 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002099 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002100
2101 /* need more space? (at least enough for what we
2102 have+the replacement+the rest of the string (starting
2103 at the new input position), so we won't have to check space
2104 when there are no errors in the rest of the string) */
2105 repptr = PyUnicode_AS_UNICODE(repunicode);
2106 repsize = PyUnicode_GET_SIZE(repunicode);
2107 requiredsize = *outpos + repsize + insize-newpos;
2108 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002109 if (requiredsize<2*outsize)
2110 requiredsize = 2*outsize;
2111 if (_PyUnicode_Resize(output, requiredsize) < 0)
2112 goto onError;
2113 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002114 }
2115 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002116 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 Py_UNICODE_COPY(*outptr, repptr, repsize);
2118 *outptr += repsize;
2119 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002120
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 /* we made it! */
2122 res = 0;
2123
Benjamin Peterson29060642009-01-31 22:14:21 +00002124 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 Py_XDECREF(restuple);
2126 return res;
2127}
2128
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002129/* --- UTF-7 Codec -------------------------------------------------------- */
2130
Antoine Pitrou244651a2009-05-04 18:56:13 +00002131/* See RFC2152 for details. We encode conservatively and decode liberally. */
2132
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 * True for 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'.  The argument is
 * expanded several times, so it should be free of side effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * 'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62 and
 * every other input -> 63, so the result is only meaningful when
 * IS_BASE64(c) holds.  The argument is expanded several times. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
 * The value is masked with 0x3f before indexing the alphabet string, so
 * any higher bits of n are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself. We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 are never decoded directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
2163
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table has one entry per ASCII code (0-127), 16 entries per row; the
 * comment above each row names the characters the row covers.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
2197
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; only values 1-127 can qualify (0 and
 *             everything from 128 up never do), which also keeps the
 *             utf7_category[] lookup inside the table
 * directO  -- non-zero: category 1 ("Set O") characters qualify
 * directWS -- non-zero: category 2 (whitespace) characters qualify
 *
 * Category 0 ("Set D") characters always qualify.  The argument c is
 * expanded several times, so it should be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002209
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002210PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002211 Py_ssize_t size,
2212 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002213{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002214 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2215}
2216
Antoine Pitrou244651a2009-05-04 18:56:13 +00002217/* The decoder. The only state we preserve is our read position,
2218 * i.e. how many characters we have consumed. So if we end in the
2219 * middle of a shift sequence we have to back off the read position
2220 * and the output to the beginning of the sequence, otherwise we lose
2221 * all the shift state (seen bits, number of bits seen, high
2222 * surrogate). */
2223
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002224PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002225 Py_ssize_t size,
2226 const char *errors,
2227 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002228{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002229 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002230 Py_ssize_t startinpos;
2231 Py_ssize_t endinpos;
2232 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002233 const char *e;
2234 PyUnicodeObject *unicode;
2235 Py_UNICODE *p;
2236 const char *errmsg = "";
2237 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002238 Py_UNICODE *shiftOutStart;
2239 unsigned int base64bits = 0;
2240 unsigned long base64buffer = 0;
2241 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002242 PyObject *errorHandler = NULL;
2243 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002244
2245 unicode = _PyUnicode_New(size);
2246 if (!unicode)
2247 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002248 if (size == 0) {
2249 if (consumed)
2250 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002251 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002252 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002253
2254 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002255 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002256 e = s + size;
2257
2258 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002260 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002261 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002262
Antoine Pitrou244651a2009-05-04 18:56:13 +00002263 if (inShift) { /* in a base-64 section */
2264 if (IS_BASE64(ch)) { /* consume a base-64 character */
2265 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2266 base64bits += 6;
2267 s++;
2268 if (base64bits >= 16) {
2269 /* we have enough bits for a UTF-16 value */
2270 Py_UNICODE outCh = (Py_UNICODE)
2271 (base64buffer >> (base64bits-16));
2272 base64bits -= 16;
2273 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2274 if (surrogate) {
2275 /* expecting a second surrogate */
2276 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2277#ifdef Py_UNICODE_WIDE
2278 *p++ = (((surrogate & 0x3FF)<<10)
2279 | (outCh & 0x3FF)) + 0x10000;
2280#else
2281 *p++ = surrogate;
2282 *p++ = outCh;
2283#endif
2284 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002285 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002286 }
2287 else {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002288 *p++ = surrogate;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002289 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002290 }
2291 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002292 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293 /* first surrogate */
2294 surrogate = outCh;
2295 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002296 else {
2297 *p++ = outCh;
2298 }
2299 }
2300 }
2301 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002302 inShift = 0;
2303 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002304 if (surrogate) {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002305 *p++ = surrogate;
2306 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002307 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002308 if (base64bits > 0) { /* left-over bits */
2309 if (base64bits >= 6) {
2310 /* We've seen at least one base-64 character */
2311 errmsg = "partial character in shift sequence";
2312 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002313 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002314 else {
2315 /* Some bits remain; they should be zero */
2316 if (base64buffer != 0) {
2317 errmsg = "non-zero padding bits in shift sequence";
2318 goto utf7Error;
2319 }
2320 }
2321 }
2322 if (ch != '-') {
2323 /* '-' is absorbed; other terminating
2324 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002325 *p++ = ch;
2326 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002327 }
2328 }
2329 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002330 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002331 s++; /* consume '+' */
2332 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002333 s++;
2334 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002335 }
2336 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002337 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002338 shiftOutStart = p;
2339 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002340 }
2341 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002342 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002343 *p++ = ch;
2344 s++;
2345 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002346 else {
2347 startinpos = s-starts;
2348 s++;
2349 errmsg = "unexpected special character";
2350 goto utf7Error;
2351 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002352 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002353utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002354 outpos = p-PyUnicode_AS_UNICODE(unicode);
2355 endinpos = s-starts;
2356 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002357 errors, &errorHandler,
2358 "utf7", errmsg,
2359 &starts, &e, &startinpos, &endinpos, &exc, &s,
2360 &unicode, &outpos, &p))
2361 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002362 }
2363
Antoine Pitrou244651a2009-05-04 18:56:13 +00002364 /* end of string */
2365
2366 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2367 /* if we're in an inconsistent state, that's an error */
2368 if (surrogate ||
2369 (base64bits >= 6) ||
2370 (base64bits > 0 && base64buffer != 0)) {
2371 outpos = p-PyUnicode_AS_UNICODE(unicode);
2372 endinpos = size;
2373 if (unicode_decode_call_errorhandler(
2374 errors, &errorHandler,
2375 "utf7", "unterminated shift sequence",
2376 &starts, &e, &startinpos, &endinpos, &exc, &s,
2377 &unicode, &outpos, &p))
2378 goto onError;
2379 if (s < e)
2380 goto restart;
2381 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002382 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002383
2384 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002385 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002386 if (inShift) {
2387 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002388 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002389 }
2390 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002391 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002392 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002393 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002394
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002395 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002396 goto onError;
2397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 Py_XDECREF(errorHandler);
2399 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002400 return (PyObject *)unicode;
2401
Benjamin Peterson29060642009-01-31 22:14:21 +00002402 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002403 Py_XDECREF(errorHandler);
2404 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002405 Py_DECREF(unicode);
2406 return NULL;
2407}
2408
2409
2410PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002411 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002412 int base64SetO,
2413 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002414 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002415{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002416 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002417 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002418 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002419 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002420 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002421 unsigned int base64bits = 0;
2422 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002423 char * out;
2424 char * start;
2425
2426 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002427 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002428
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002429 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002430 return PyErr_NoMemory();
2431
Antoine Pitrou244651a2009-05-04 18:56:13 +00002432 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002433 if (v == NULL)
2434 return NULL;
2435
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002436 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002437 for (;i < size; ++i) {
2438 Py_UNICODE ch = s[i];
2439
Antoine Pitrou244651a2009-05-04 18:56:13 +00002440 if (inShift) {
2441 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2442 /* shifting out */
2443 if (base64bits) { /* output remaining bits */
2444 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2445 base64buffer = 0;
2446 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002447 }
2448 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002449 /* Characters not in the BASE64 set implicitly unshift the sequence
2450 so no '-' is required, except if the character is itself a '-' */
2451 if (IS_BASE64(ch) || ch == '-') {
2452 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002453 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002454 *out++ = (char) ch;
2455 }
2456 else {
2457 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002458 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002459 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002460 else { /* not in a shift sequence */
2461 if (ch == '+') {
2462 *out++ = '+';
2463 *out++ = '-';
2464 }
2465 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2466 *out++ = (char) ch;
2467 }
2468 else {
2469 *out++ = '+';
2470 inShift = 1;
2471 goto encode_char;
2472 }
2473 }
2474 continue;
2475encode_char:
2476#ifdef Py_UNICODE_WIDE
2477 if (ch >= 0x10000) {
2478 /* code first surrogate */
2479 base64bits += 16;
2480 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2481 while (base64bits >= 6) {
2482 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2483 base64bits -= 6;
2484 }
2485 /* prepare second surrogate */
2486 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2487 }
2488#endif
2489 base64bits += 16;
2490 base64buffer = (base64buffer << 16) | ch;
2491 while (base64bits >= 6) {
2492 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2493 base64bits -= 6;
2494 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002495 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002496 if (base64bits)
2497 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2498 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002499 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002500 if (_PyBytes_Resize(&v, out - start) < 0)
2501 return NULL;
2502 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002503}
2504
Antoine Pitrou244651a2009-05-04 18:56:13 +00002505#undef IS_BASE64
2506#undef FROM_BASE64
2507#undef TO_BASE64
2508#undef DECODE_DIRECT
2509#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002510
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511/* --- UTF-8 Codec -------------------------------------------------------- */
2512
/* Map a UTF-8 lead byte to the total length, in bytes, of the sequence it
   introduces.  Zero means illegal prefix: continuation bytes (80-BF), the
   overlong leads C0-C1, and F5-FF.  See RFC 3629 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2534
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002536 Py_ssize_t size,
2537 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538{
Walter Dörwald69652032004-09-07 20:24:22 +00002539 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2540}
2541
Antoine Pitrouab868312009-01-10 15:40:25 +00002542/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2543#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2544
2545/* Mask to quickly check whether a C 'long' contains a
2546 non-ASCII, UTF8-encoded char. */
2547#if (SIZEOF_LONG == 8)
2548# define ASCII_CHAR_MASK 0x8080808080808080L
2549#elif (SIZEOF_LONG == 4)
2550# define ASCII_CHAR_MASK 0x80808080L
2551#else
2552# error C 'long' size should be either 4 or 8!
2553#endif
2554
Walter Dörwald69652032004-09-07 20:24:22 +00002555PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002556 Py_ssize_t size,
2557 const char *errors,
2558 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002559{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002562 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002563 Py_ssize_t startinpos;
2564 Py_ssize_t endinpos;
2565 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002566 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567 PyUnicodeObject *unicode;
2568 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002569 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 PyObject *errorHandler = NULL;
2571 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572
2573 /* Note: size will always be longer than the resulting Unicode
2574 character count */
2575 unicode = _PyUnicode_New(size);
2576 if (!unicode)
2577 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002578 if (size == 0) {
2579 if (consumed)
2580 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583
2584 /* Unpack UTF-8 encoded data */
2585 p = unicode->str;
2586 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002587 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588
2589 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002590 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591
2592 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002593 /* Fast path for runs of ASCII characters. Given that common UTF-8
2594 input will consist of an overwhelming majority of ASCII
2595 characters, we try to optimize for this case by checking
2596 as many characters as a C 'long' can contain.
2597 First, check if we can do an aligned read, as most CPUs have
2598 a penalty for unaligned reads.
2599 */
2600 if (!((size_t) s & LONG_PTR_MASK)) {
2601 /* Help register allocation */
2602 register const char *_s = s;
2603 register Py_UNICODE *_p = p;
2604 while (_s < aligned_end) {
2605 /* Read a whole long at a time (either 4 or 8 bytes),
2606 and do a fast unrolled copy if it only contains ASCII
2607 characters. */
2608 unsigned long data = *(unsigned long *) _s;
2609 if (data & ASCII_CHAR_MASK)
2610 break;
2611 _p[0] = (unsigned char) _s[0];
2612 _p[1] = (unsigned char) _s[1];
2613 _p[2] = (unsigned char) _s[2];
2614 _p[3] = (unsigned char) _s[3];
2615#if (SIZEOF_LONG == 8)
2616 _p[4] = (unsigned char) _s[4];
2617 _p[5] = (unsigned char) _s[5];
2618 _p[6] = (unsigned char) _s[6];
2619 _p[7] = (unsigned char) _s[7];
2620#endif
2621 _s += SIZEOF_LONG;
2622 _p += SIZEOF_LONG;
2623 }
2624 s = _s;
2625 p = _p;
2626 if (s == e)
2627 break;
2628 ch = (unsigned char)*s;
2629 }
2630 }
2631
2632 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002633 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 s++;
2635 continue;
2636 }
2637
2638 n = utf8_code_length[ch];
2639
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002640 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002641 if (consumed)
2642 break;
2643 else {
2644 errmsg = "unexpected end of data";
2645 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002646 endinpos = startinpos+1;
2647 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2648 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002649 goto utf8Error;
2650 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652
2653 switch (n) {
2654
2655 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002656 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002657 startinpos = s-starts;
2658 endinpos = startinpos+1;
2659 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660
2661 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002662 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002663 startinpos = s-starts;
2664 endinpos = startinpos+1;
2665 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666
2667 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002668 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002669 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002670 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002671 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002672 goto utf8Error;
2673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002675 assert ((ch > 0x007F) && (ch <= 0x07FF));
2676 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 break;
2678
2679 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002680 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2681 will result in surrogates in range d800-dfff. Surrogates are
2682 not valid UTF-8 so they are rejected.
2683 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2684 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002685 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002686 (s[2] & 0xc0) != 0x80 ||
2687 ((unsigned char)s[0] == 0xE0 &&
2688 (unsigned char)s[1] < 0xA0) ||
2689 ((unsigned char)s[0] == 0xED &&
2690 (unsigned char)s[1] > 0x9F)) {
2691 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002693 endinpos = startinpos + 1;
2694
2695 /* if s[1] first two bits are 1 and 0, then the invalid
2696 continuation byte is s[2], so increment endinpos by 1,
2697 if not, s[1] is invalid and endinpos doesn't need to
2698 be incremented. */
2699 if ((s[1] & 0xC0) == 0x80)
2700 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002701 goto utf8Error;
2702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002704 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2705 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002706 break;
2707
2708 case 4:
2709 if ((s[1] & 0xc0) != 0x80 ||
2710 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002711 (s[3] & 0xc0) != 0x80 ||
2712 ((unsigned char)s[0] == 0xF0 &&
2713 (unsigned char)s[1] < 0x90) ||
2714 ((unsigned char)s[0] == 0xF4 &&
2715 (unsigned char)s[1] > 0x8F)) {
2716 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002717 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002718 endinpos = startinpos + 1;
2719 if ((s[1] & 0xC0) == 0x80) {
2720 endinpos++;
2721 if ((s[2] & 0xC0) == 0x80)
2722 endinpos++;
2723 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002724 goto utf8Error;
2725 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002726 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002727 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2728 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2729
Fredrik Lundh8f455852001-06-27 18:59:43 +00002730#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002731 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002732#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002733 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002734
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002735 /* translate from 10000..10FFFF to 0..FFFF */
2736 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002737
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002738 /* high surrogate = top 10 bits added to D800 */
2739 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002740
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002741 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002742 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002743#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 }
2746 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002747 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002748
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 utf8Error:
2750 outpos = p-PyUnicode_AS_UNICODE(unicode);
2751 if (unicode_decode_call_errorhandler(
2752 errors, &errorHandler,
2753 "utf8", errmsg,
2754 &starts, &e, &startinpos, &endinpos, &exc, &s,
2755 &unicode, &outpos, &p))
2756 goto onError;
2757 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 }
Walter Dörwald69652032004-09-07 20:24:22 +00002759 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761
2762 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002763 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 goto onError;
2765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 Py_XDECREF(errorHandler);
2767 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 return (PyObject *)unicode;
2769
Benjamin Peterson29060642009-01-31 22:14:21 +00002770 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002771 Py_XDECREF(errorHandler);
2772 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 Py_DECREF(unicode);
2774 return NULL;
2775}
2776
Antoine Pitrouab868312009-01-10 15:40:25 +00002777#undef ASCII_CHAR_MASK
2778
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002779#ifdef __APPLE__
2780
2781/* Simplified UTF-8 decoder using surrogateescape error handler,
2782 used to decode the command line arguments on Mac OS X. */
2783
2784wchar_t*
2785_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2786{
2787 int n;
2788 const char *e;
2789 wchar_t *unicode, *p;
2790
2791 /* Note: size will always be longer than the resulting Unicode
2792 character count */
2793 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2794 PyErr_NoMemory();
2795 return NULL;
2796 }
2797 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2798 if (!unicode)
2799 return NULL;
2800
2801 /* Unpack UTF-8 encoded data */
2802 p = unicode;
2803 e = s + size;
2804 while (s < e) {
2805 Py_UCS4 ch = (unsigned char)*s;
2806
2807 if (ch < 0x80) {
2808 *p++ = (wchar_t)ch;
2809 s++;
2810 continue;
2811 }
2812
2813 n = utf8_code_length[ch];
2814 if (s + n > e) {
2815 goto surrogateescape;
2816 }
2817
2818 switch (n) {
2819 case 0:
2820 case 1:
2821 goto surrogateescape;
2822
2823 case 2:
2824 if ((s[1] & 0xc0) != 0x80)
2825 goto surrogateescape;
2826 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2827 assert ((ch > 0x007F) && (ch <= 0x07FF));
2828 *p++ = (wchar_t)ch;
2829 break;
2830
2831 case 3:
2832 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2833 will result in surrogates in range d800-dfff. Surrogates are
2834 not valid UTF-8 so they are rejected.
2835 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2836 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2837 if ((s[1] & 0xc0) != 0x80 ||
2838 (s[2] & 0xc0) != 0x80 ||
2839 ((unsigned char)s[0] == 0xE0 &&
2840 (unsigned char)s[1] < 0xA0) ||
2841 ((unsigned char)s[0] == 0xED &&
2842 (unsigned char)s[1] > 0x9F)) {
2843
2844 goto surrogateescape;
2845 }
2846 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2847 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2848 *p++ = (Py_UNICODE)ch;
2849 break;
2850
2851 case 4:
2852 if ((s[1] & 0xc0) != 0x80 ||
2853 (s[2] & 0xc0) != 0x80 ||
2854 (s[3] & 0xc0) != 0x80 ||
2855 ((unsigned char)s[0] == 0xF0 &&
2856 (unsigned char)s[1] < 0x90) ||
2857 ((unsigned char)s[0] == 0xF4 &&
2858 (unsigned char)s[1] > 0x8F)) {
2859 goto surrogateescape;
2860 }
2861 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2862 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2863 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2864
2865#if SIZEOF_WCHAR_T == 4
2866 *p++ = (wchar_t)ch;
2867#else
2868 /* compute and append the two surrogates: */
2869
2870 /* translate from 10000..10FFFF to 0..FFFF */
2871 ch -= 0x10000;
2872
2873 /* high surrogate = top 10 bits added to D800 */
2874 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2875
2876 /* low surrogate = bottom 10 bits added to DC00 */
2877 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2878#endif
2879 break;
2880 }
2881 s += n;
2882 continue;
2883
2884 surrogateescape:
2885 *p++ = 0xDC00 + ch;
2886 s++;
2887 }
2888 *p = L'\0';
2889 return unicode;
2890}
2891
2892#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002893
Tim Peters602f7402002-04-27 18:03:26 +00002894/* Allocation strategy: if the string is short, convert into a stack buffer
2895 and allocate exactly as much space needed at the end. Else allocate the
2896 maximum possible needed (4 result bytes per Unicode character), and return
2897 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002898*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002899PyObject *
2900PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 Py_ssize_t size,
2902 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903{
Tim Peters602f7402002-04-27 18:03:26 +00002904#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002905
Guido van Rossum98297ee2007-11-06 21:34:58 +00002906 Py_ssize_t i; /* index into s of next input byte */
2907 PyObject *result; /* result string object */
2908 char *p; /* next free byte in output buffer */
2909 Py_ssize_t nallocated; /* number of result bytes allocated */
2910 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002911 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002912 PyObject *errorHandler = NULL;
2913 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002914
Tim Peters602f7402002-04-27 18:03:26 +00002915 assert(s != NULL);
2916 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917
Tim Peters602f7402002-04-27 18:03:26 +00002918 if (size <= MAX_SHORT_UNICHARS) {
2919 /* Write into the stack buffer; nallocated can't overflow.
2920 * At the end, we'll allocate exactly as much heap space as it
2921 * turns out we need.
2922 */
2923 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002924 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002925 p = stackbuf;
2926 }
2927 else {
2928 /* Overallocate on the heap, and give the excess back at the end. */
2929 nallocated = size * 4;
2930 if (nallocated / 4 != size) /* overflow! */
2931 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002932 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002933 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002934 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002935 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002936 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002937
Tim Peters602f7402002-04-27 18:03:26 +00002938 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002939 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002940
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002941 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002942 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002944
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002946 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002947 *p++ = (char)(0xc0 | (ch >> 6));
2948 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002949 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002950#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002951 /* Special case: check for high and low surrogate */
2952 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2953 Py_UCS4 ch2 = s[i];
2954 /* Combine the two surrogates to form a UCS4 value */
2955 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2956 i++;
2957
2958 /* Encode UCS4 Unicode ordinals */
2959 *p++ = (char)(0xf0 | (ch >> 18));
2960 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002961 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2962 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002963 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002964#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002965 Py_ssize_t newpos;
2966 PyObject *rep;
2967 Py_ssize_t repsize, k;
2968 rep = unicode_encode_call_errorhandler
2969 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2970 s, size, &exc, i-1, i, &newpos);
2971 if (!rep)
2972 goto error;
2973
2974 if (PyBytes_Check(rep))
2975 repsize = PyBytes_GET_SIZE(rep);
2976 else
2977 repsize = PyUnicode_GET_SIZE(rep);
2978
2979 if (repsize > 4) {
2980 Py_ssize_t offset;
2981
2982 if (result == NULL)
2983 offset = p - stackbuf;
2984 else
2985 offset = p - PyBytes_AS_STRING(result);
2986
2987 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2988 /* integer overflow */
2989 PyErr_NoMemory();
2990 goto error;
2991 }
2992 nallocated += repsize - 4;
2993 if (result != NULL) {
2994 if (_PyBytes_Resize(&result, nallocated) < 0)
2995 goto error;
2996 } else {
2997 result = PyBytes_FromStringAndSize(NULL, nallocated);
2998 if (result == NULL)
2999 goto error;
3000 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3001 }
3002 p = PyBytes_AS_STRING(result) + offset;
3003 }
3004
3005 if (PyBytes_Check(rep)) {
3006 char *prep = PyBytes_AS_STRING(rep);
3007 for(k = repsize; k > 0; k--)
3008 *p++ = *prep++;
3009 } else /* rep is unicode */ {
3010 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3011 Py_UNICODE c;
3012
3013 for(k=0; k<repsize; k++) {
3014 c = prep[k];
3015 if (0x80 <= c) {
3016 raise_encode_exception(&exc, "utf-8", s, size,
3017 i-1, i, "surrogates not allowed");
3018 goto error;
3019 }
3020 *p++ = (char)prep[k];
3021 }
3022 }
3023 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003024#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003025 }
Victor Stinner445a6232010-04-22 20:01:57 +00003026#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003027 } else if (ch < 0x10000) {
3028 *p++ = (char)(0xe0 | (ch >> 12));
3029 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3030 *p++ = (char)(0x80 | (ch & 0x3f));
3031 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003032 /* Encode UCS4 Unicode ordinals */
3033 *p++ = (char)(0xf0 | (ch >> 18));
3034 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3035 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3036 *p++ = (char)(0x80 | (ch & 0x3f));
3037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003039
Guido van Rossum98297ee2007-11-06 21:34:58 +00003040 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003041 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003042 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003043 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003044 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003045 }
3046 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003047 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003048 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003049 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003050 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003051 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003052 Py_XDECREF(errorHandler);
3053 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003054 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003055 error:
3056 Py_XDECREF(errorHandler);
3057 Py_XDECREF(exc);
3058 Py_XDECREF(result);
3059 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003060
Tim Peters602f7402002-04-27 18:03:26 +00003061#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062}
3063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3065{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 if (!PyUnicode_Check(unicode)) {
3067 PyErr_BadArgument();
3068 return NULL;
3069 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003070 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 PyUnicode_GET_SIZE(unicode),
3072 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073}
3074
Walter Dörwald41980ca2007-08-16 21:55:45 +00003075/* --- UTF-32 Codec ------------------------------------------------------- */
3076
3077PyObject *
3078PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003079 Py_ssize_t size,
3080 const char *errors,
3081 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003082{
3083 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3084}
3085
3086PyObject *
3087PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 Py_ssize_t size,
3089 const char *errors,
3090 int *byteorder,
3091 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003092{
3093 const char *starts = s;
3094 Py_ssize_t startinpos;
3095 Py_ssize_t endinpos;
3096 Py_ssize_t outpos;
3097 PyUnicodeObject *unicode;
3098 Py_UNICODE *p;
3099#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003100 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003101 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003102#else
3103 const int pairs = 0;
3104#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003105 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003106 int bo = 0; /* assume native ordering by default */
3107 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003108 /* Offsets from q for retrieving bytes in the right order. */
3109#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3110 int iorder[] = {0, 1, 2, 3};
3111#else
3112 int iorder[] = {3, 2, 1, 0};
3113#endif
3114 PyObject *errorHandler = NULL;
3115 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003116
Walter Dörwald41980ca2007-08-16 21:55:45 +00003117 q = (unsigned char *)s;
3118 e = q + size;
3119
3120 if (byteorder)
3121 bo = *byteorder;
3122
3123 /* Check for BOM marks (U+FEFF) in the input and adjust current
3124 byte order setting accordingly. In native mode, the leading BOM
3125 mark is skipped, in all other modes, it is copied to the output
3126 stream as-is (giving a ZWNBSP character). */
3127 if (bo == 0) {
3128 if (size >= 4) {
3129 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003130 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003131#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003132 if (bom == 0x0000FEFF) {
3133 q += 4;
3134 bo = -1;
3135 }
3136 else if (bom == 0xFFFE0000) {
3137 q += 4;
3138 bo = 1;
3139 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003140#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003141 if (bom == 0x0000FEFF) {
3142 q += 4;
3143 bo = 1;
3144 }
3145 else if (bom == 0xFFFE0000) {
3146 q += 4;
3147 bo = -1;
3148 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003149#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003150 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003151 }
3152
3153 if (bo == -1) {
3154 /* force LE */
3155 iorder[0] = 0;
3156 iorder[1] = 1;
3157 iorder[2] = 2;
3158 iorder[3] = 3;
3159 }
3160 else if (bo == 1) {
3161 /* force BE */
3162 iorder[0] = 3;
3163 iorder[1] = 2;
3164 iorder[2] = 1;
3165 iorder[3] = 0;
3166 }
3167
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003168 /* On narrow builds we split characters outside the BMP into two
3169 codepoints => count how much extra space we need. */
3170#ifndef Py_UNICODE_WIDE
3171 for (qq = q; qq < e; qq += 4)
3172 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3173 pairs++;
3174#endif
3175
3176 /* This might be one to much, because of a BOM */
3177 unicode = _PyUnicode_New((size+3)/4+pairs);
3178 if (!unicode)
3179 return NULL;
3180 if (size == 0)
3181 return (PyObject *)unicode;
3182
3183 /* Unpack UTF-32 encoded data */
3184 p = unicode->str;
3185
Walter Dörwald41980ca2007-08-16 21:55:45 +00003186 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 Py_UCS4 ch;
3188 /* remaining bytes at the end? (size should be divisible by 4) */
3189 if (e-q<4) {
3190 if (consumed)
3191 break;
3192 errmsg = "truncated data";
3193 startinpos = ((const char *)q)-starts;
3194 endinpos = ((const char *)e)-starts;
3195 goto utf32Error;
3196 /* The remaining input chars are ignored if the callback
3197 chooses to skip the input */
3198 }
3199 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3200 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003201
Benjamin Peterson29060642009-01-31 22:14:21 +00003202 if (ch >= 0x110000)
3203 {
3204 errmsg = "codepoint not in range(0x110000)";
3205 startinpos = ((const char *)q)-starts;
3206 endinpos = startinpos+4;
3207 goto utf32Error;
3208 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003209#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 if (ch >= 0x10000)
3211 {
3212 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3213 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3214 }
3215 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003216#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 *p++ = ch;
3218 q += 4;
3219 continue;
3220 utf32Error:
3221 outpos = p-PyUnicode_AS_UNICODE(unicode);
3222 if (unicode_decode_call_errorhandler(
3223 errors, &errorHandler,
3224 "utf32", errmsg,
3225 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3226 &unicode, &outpos, &p))
3227 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003228 }
3229
3230 if (byteorder)
3231 *byteorder = bo;
3232
3233 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003235
3236 /* Adjust length */
3237 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3238 goto onError;
3239
3240 Py_XDECREF(errorHandler);
3241 Py_XDECREF(exc);
3242 return (PyObject *)unicode;
3243
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003245 Py_DECREF(unicode);
3246 Py_XDECREF(errorHandler);
3247 Py_XDECREF(exc);
3248 return NULL;
3249}
3250
3251PyObject *
3252PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003253 Py_ssize_t size,
3254 const char *errors,
3255 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003256{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003257 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003258 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003259 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003260#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003261 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003262#else
3263 const int pairs = 0;
3264#endif
3265 /* Offsets from p for storing byte pairs in the right order. */
3266#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3267 int iorder[] = {0, 1, 2, 3};
3268#else
3269 int iorder[] = {3, 2, 1, 0};
3270#endif
3271
Benjamin Peterson29060642009-01-31 22:14:21 +00003272#define STORECHAR(CH) \
3273 do { \
3274 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3275 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3276 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3277 p[iorder[0]] = (CH) & 0xff; \
3278 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003279 } while(0)
3280
3281 /* In narrow builds we can output surrogate pairs as one codepoint,
3282 so we need less space. */
3283#ifndef Py_UNICODE_WIDE
3284 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3286 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3287 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003288#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003289 nsize = (size - pairs + (byteorder == 0));
3290 bytesize = nsize * 4;
3291 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003292 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003293 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003294 if (v == NULL)
3295 return NULL;
3296
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003297 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003298 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003299 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003300 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003301 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003302
3303 if (byteorder == -1) {
3304 /* force LE */
3305 iorder[0] = 0;
3306 iorder[1] = 1;
3307 iorder[2] = 2;
3308 iorder[3] = 3;
3309 }
3310 else if (byteorder == 1) {
3311 /* force BE */
3312 iorder[0] = 3;
3313 iorder[1] = 2;
3314 iorder[2] = 1;
3315 iorder[3] = 0;
3316 }
3317
3318 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003319 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003321 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3322 Py_UCS4 ch2 = *s;
3323 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3324 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3325 s++;
3326 size--;
3327 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003328 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003329#endif
3330 STORECHAR(ch);
3331 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003332
3333 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003334 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003335#undef STORECHAR
3336}
3337
3338PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3339{
3340 if (!PyUnicode_Check(unicode)) {
3341 PyErr_BadArgument();
3342 return NULL;
3343 }
3344 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 PyUnicode_GET_SIZE(unicode),
3346 NULL,
3347 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003348}
3349
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350/* --- UTF-16 Codec ------------------------------------------------------- */
3351
Tim Peters772747b2001-08-09 22:21:55 +00003352PyObject *
3353PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003354 Py_ssize_t size,
3355 const char *errors,
3356 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357{
Walter Dörwald69652032004-09-07 20:24:22 +00003358 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3359}
3360
Antoine Pitrouab868312009-01-10 15:40:25 +00003361/* Two masks for fast checking of whether a C 'long' may contain
3362 UTF16-encoded surrogate characters. This is an efficient heuristic,
3363 assuming that non-surrogate characters with a code point >= 0x8000 are
3364 rare in most input.
3365 FAST_CHAR_MASK is used when the input is in native byte ordering,
3366 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003367*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003368#if (SIZEOF_LONG == 8)
3369# define FAST_CHAR_MASK 0x8000800080008000L
3370# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3371#elif (SIZEOF_LONG == 4)
3372# define FAST_CHAR_MASK 0x80008000L
3373# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3374#else
3375# error C 'long' size should be either 4 or 8!
3376#endif
3377
Walter Dörwald69652032004-09-07 20:24:22 +00003378PyObject *
3379PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003380 Py_ssize_t size,
3381 const char *errors,
3382 int *byteorder,
3383 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003384{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003386 Py_ssize_t startinpos;
3387 Py_ssize_t endinpos;
3388 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 PyUnicodeObject *unicode;
3390 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003391 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003392 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003393 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003394 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003395 /* Offsets from q for retrieving byte pairs in the right order. */
3396#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3397 int ihi = 1, ilo = 0;
3398#else
3399 int ihi = 0, ilo = 1;
3400#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 PyObject *errorHandler = NULL;
3402 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403
3404 /* Note: size will always be longer than the resulting Unicode
3405 character count */
3406 unicode = _PyUnicode_New(size);
3407 if (!unicode)
3408 return NULL;
3409 if (size == 0)
3410 return (PyObject *)unicode;
3411
3412 /* Unpack UTF-16 encoded data */
3413 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003414 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003415 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416
3417 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003418 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003420 /* Check for BOM marks (U+FEFF) in the input and adjust current
3421 byte order setting accordingly. In native mode, the leading BOM
3422 mark is skipped, in all other modes, it is copied to the output
3423 stream as-is (giving a ZWNBSP character). */
3424 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003425 if (size >= 2) {
3426 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003427#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 if (bom == 0xFEFF) {
3429 q += 2;
3430 bo = -1;
3431 }
3432 else if (bom == 0xFFFE) {
3433 q += 2;
3434 bo = 1;
3435 }
Tim Petersced69f82003-09-16 20:30:58 +00003436#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003437 if (bom == 0xFEFF) {
3438 q += 2;
3439 bo = 1;
3440 }
3441 else if (bom == 0xFFFE) {
3442 q += 2;
3443 bo = -1;
3444 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003445#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448
Tim Peters772747b2001-08-09 22:21:55 +00003449 if (bo == -1) {
3450 /* force LE */
3451 ihi = 1;
3452 ilo = 0;
3453 }
3454 else if (bo == 1) {
3455 /* force BE */
3456 ihi = 0;
3457 ilo = 1;
3458 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003459#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3460 native_ordering = ilo < ihi;
3461#else
3462 native_ordering = ilo > ihi;
3463#endif
Tim Peters772747b2001-08-09 22:21:55 +00003464
Antoine Pitrouab868312009-01-10 15:40:25 +00003465 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003466 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003468 /* First check for possible aligned read of a C 'long'. Unaligned
3469 reads are more expensive, better to defer to another iteration. */
3470 if (!((size_t) q & LONG_PTR_MASK)) {
3471 /* Fast path for runs of non-surrogate chars. */
3472 register const unsigned char *_q = q;
3473 Py_UNICODE *_p = p;
3474 if (native_ordering) {
3475 /* Native ordering is simple: as long as the input cannot
3476 possibly contain a surrogate char, do an unrolled copy
3477 of several 16-bit code points to the target object.
3478 The non-surrogate check is done on several input bytes
3479 at a time (as many as a C 'long' can contain). */
3480 while (_q < aligned_end) {
3481 unsigned long data = * (unsigned long *) _q;
3482 if (data & FAST_CHAR_MASK)
3483 break;
3484 _p[0] = ((unsigned short *) _q)[0];
3485 _p[1] = ((unsigned short *) _q)[1];
3486#if (SIZEOF_LONG == 8)
3487 _p[2] = ((unsigned short *) _q)[2];
3488 _p[3] = ((unsigned short *) _q)[3];
3489#endif
3490 _q += SIZEOF_LONG;
3491 _p += SIZEOF_LONG / 2;
3492 }
3493 }
3494 else {
3495 /* Byteswapped ordering is similar, but we must decompose
3496 the copy bytewise, and take care of zero'ing out the
3497 upper bytes if the target object is in 32-bit units
3498 (that is, in UCS-4 builds). */
3499 while (_q < aligned_end) {
3500 unsigned long data = * (unsigned long *) _q;
3501 if (data & SWAPPED_FAST_CHAR_MASK)
3502 break;
3503 /* Zero upper bytes in UCS-4 builds */
3504#if (Py_UNICODE_SIZE > 2)
3505 _p[0] = 0;
3506 _p[1] = 0;
3507#if (SIZEOF_LONG == 8)
3508 _p[2] = 0;
3509 _p[3] = 0;
3510#endif
3511#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003512 /* Issue #4916; UCS-4 builds on big endian machines must
3513 fill the two last bytes of each 4-byte unit. */
3514#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3515# define OFF 2
3516#else
3517# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003518#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003519 ((unsigned char *) _p)[OFF + 1] = _q[0];
3520 ((unsigned char *) _p)[OFF + 0] = _q[1];
3521 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3522 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3523#if (SIZEOF_LONG == 8)
3524 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3525 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3526 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3527 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3528#endif
3529#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003530 _q += SIZEOF_LONG;
3531 _p += SIZEOF_LONG / 2;
3532 }
3533 }
3534 p = _p;
3535 q = _q;
3536 if (q >= e)
3537 break;
3538 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540
Benjamin Peterson14339b62009-01-31 16:36:08 +00003541 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003542
3543 if (ch < 0xD800 || ch > 0xDFFF) {
3544 *p++ = ch;
3545 continue;
3546 }
3547
3548 /* UTF-16 code pair: */
3549 if (q > e) {
3550 errmsg = "unexpected end of data";
3551 startinpos = (((const char *)q) - 2) - starts;
3552 endinpos = ((const char *)e) + 1 - starts;
3553 goto utf16Error;
3554 }
3555 if (0xD800 <= ch && ch <= 0xDBFF) {
3556 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3557 q += 2;
3558 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003559#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 *p++ = ch;
3561 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003562#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003563 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003564#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003565 continue;
3566 }
3567 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003568 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 startinpos = (((const char *)q)-4)-starts;
3570 endinpos = startinpos+2;
3571 goto utf16Error;
3572 }
3573
Benjamin Peterson14339b62009-01-31 16:36:08 +00003574 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003575 errmsg = "illegal encoding";
3576 startinpos = (((const char *)q)-2)-starts;
3577 endinpos = startinpos+2;
3578 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003579
Benjamin Peterson29060642009-01-31 22:14:21 +00003580 utf16Error:
3581 outpos = p - PyUnicode_AS_UNICODE(unicode);
3582 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003583 errors,
3584 &errorHandler,
3585 "utf16", errmsg,
3586 &starts,
3587 (const char **)&e,
3588 &startinpos,
3589 &endinpos,
3590 &exc,
3591 (const char **)&q,
3592 &unicode,
3593 &outpos,
3594 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003595 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003597 /* remaining byte at the end? (size should be even) */
3598 if (e == q) {
3599 if (!consumed) {
3600 errmsg = "truncated data";
3601 startinpos = ((const char *)q) - starts;
3602 endinpos = ((const char *)e) + 1 - starts;
3603 outpos = p - PyUnicode_AS_UNICODE(unicode);
3604 if (unicode_decode_call_errorhandler(
3605 errors,
3606 &errorHandler,
3607 "utf16", errmsg,
3608 &starts,
3609 (const char **)&e,
3610 &startinpos,
3611 &endinpos,
3612 &exc,
3613 (const char **)&q,
3614 &unicode,
3615 &outpos,
3616 &p))
3617 goto onError;
3618 /* The remaining input chars are ignored if the callback
3619 chooses to skip the input */
3620 }
3621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622
3623 if (byteorder)
3624 *byteorder = bo;
3625
Walter Dörwald69652032004-09-07 20:24:22 +00003626 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003628
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003630 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631 goto onError;
3632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 Py_XDECREF(errorHandler);
3634 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 return (PyObject *)unicode;
3636
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 Py_XDECREF(errorHandler);
3640 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 return NULL;
3642}
3643
Antoine Pitrouab868312009-01-10 15:40:25 +00003644#undef FAST_CHAR_MASK
3645#undef SWAPPED_FAST_CHAR_MASK
3646
Tim Peters772747b2001-08-09 22:21:55 +00003647PyObject *
3648PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 Py_ssize_t size,
3650 const char *errors,
3651 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003653 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003654 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003655 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003656#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003657 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003658#else
3659 const int pairs = 0;
3660#endif
Tim Peters772747b2001-08-09 22:21:55 +00003661 /* Offsets from p for storing byte pairs in the right order. */
3662#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3663 int ihi = 1, ilo = 0;
3664#else
3665 int ihi = 0, ilo = 1;
3666#endif
3667
Benjamin Peterson29060642009-01-31 22:14:21 +00003668#define STORECHAR(CH) \
3669 do { \
3670 p[ihi] = ((CH) >> 8) & 0xff; \
3671 p[ilo] = (CH) & 0xff; \
3672 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003673 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003675#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003676 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 if (s[i] >= 0x10000)
3678 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003679#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003680 /* 2 * (size + pairs + (byteorder == 0)) */
3681 if (size > PY_SSIZE_T_MAX ||
3682 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003684 nsize = size + pairs + (byteorder == 0);
3685 bytesize = nsize * 2;
3686 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003688 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 if (v == NULL)
3690 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003692 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003695 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003696 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003697
3698 if (byteorder == -1) {
3699 /* force LE */
3700 ihi = 1;
3701 ilo = 0;
3702 }
3703 else if (byteorder == 1) {
3704 /* force BE */
3705 ihi = 0;
3706 ilo = 1;
3707 }
3708
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003709 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 Py_UNICODE ch = *s++;
3711 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003712#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003713 if (ch >= 0x10000) {
3714 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3715 ch = 0xD800 | ((ch-0x10000) >> 10);
3716 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003717#endif
Tim Peters772747b2001-08-09 22:21:55 +00003718 STORECHAR(ch);
3719 if (ch2)
3720 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003721 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003722
3723 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003724 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003725#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003726}
3727
3728PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3729{
3730 if (!PyUnicode_Check(unicode)) {
3731 PyErr_BadArgument();
3732 return NULL;
3733 }
3734 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003735 PyUnicode_GET_SIZE(unicode),
3736 NULL,
3737 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738}
3739
3740/* --- Unicode Escape Codec ----------------------------------------------- */
3741
/* File-local pointer to a _PyUnicode_Name_CAPI structure; it starts out
   NULL.  NOTE(review): presumably filled in on first use by the named
   character escape handling in the unicode-escape decoder -- confirm
   against the code that assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00003742static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003743
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 Py_ssize_t size,
3746 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003749 Py_ssize_t startinpos;
3750 Py_ssize_t endinpos;
3751 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003756 char* message;
3757 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 PyObject *errorHandler = NULL;
3759 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003760
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 /* Escaped strings will always be longer than the resulting
3762 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 length after conversion to the true value.
3764 (but if the error callback returns a long replacement string
3765 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 v = _PyUnicode_New(size);
3767 if (v == NULL)
3768 goto onError;
3769 if (size == 0)
3770 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003774
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 while (s < end) {
3776 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003777 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779
3780 /* Non-escape characters are interpreted as Unicode ordinals */
3781 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003782 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 continue;
3784 }
3785
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787 /* \ - Escapes */
3788 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003789 c = *s++;
3790 if (s > end)
3791 c = '\0'; /* Invalid after \ */
3792 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793
Benjamin Peterson29060642009-01-31 22:14:21 +00003794 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 case '\n': break;
3796 case '\\': *p++ = '\\'; break;
3797 case '\'': *p++ = '\''; break;
3798 case '\"': *p++ = '\"'; break;
3799 case 'b': *p++ = '\b'; break;
3800 case 'f': *p++ = '\014'; break; /* FF */
3801 case 't': *p++ = '\t'; break;
3802 case 'n': *p++ = '\n'; break;
3803 case 'r': *p++ = '\r'; break;
3804 case 'v': *p++ = '\013'; break; /* VT */
3805 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3806
Benjamin Peterson29060642009-01-31 22:14:21 +00003807 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 case '0': case '1': case '2': case '3':
3809 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003810 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003811 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003812 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003813 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003814 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003816 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 break;
3818
Benjamin Peterson29060642009-01-31 22:14:21 +00003819 /* hex escapes */
3820 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003822 digits = 2;
3823 message = "truncated \\xXX escape";
3824 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825
Benjamin Peterson29060642009-01-31 22:14:21 +00003826 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003828 digits = 4;
3829 message = "truncated \\uXXXX escape";
3830 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831
Benjamin Peterson29060642009-01-31 22:14:21 +00003832 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003833 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003834 digits = 8;
3835 message = "truncated \\UXXXXXXXX escape";
3836 hexescape:
3837 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838 outpos = p-PyUnicode_AS_UNICODE(v);
3839 if (s+digits>end) {
3840 endinpos = size;
3841 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 errors, &errorHandler,
3843 "unicodeescape", "end of string in escape sequence",
3844 &starts, &end, &startinpos, &endinpos, &exc, &s,
3845 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003846 goto onError;
3847 goto nextByte;
3848 }
3849 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003850 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003851 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852 endinpos = (s+i+1)-starts;
3853 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003854 errors, &errorHandler,
3855 "unicodeescape", message,
3856 &starts, &end, &startinpos, &endinpos, &exc, &s,
3857 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003858 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003859 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003860 }
3861 chr = (chr<<4) & ~0xF;
3862 if (c >= '0' && c <= '9')
3863 chr += c - '0';
3864 else if (c >= 'a' && c <= 'f')
3865 chr += 10 + c - 'a';
3866 else
3867 chr += 10 + c - 'A';
3868 }
3869 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003870 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871 /* _decoding_error will have already written into the
3872 target buffer. */
3873 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003874 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003875 /* when we get here, chr is a 32-bit unicode character */
3876 if (chr <= 0xffff)
3877 /* UCS-2 character */
3878 *p++ = (Py_UNICODE) chr;
3879 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003880 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003881 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003882#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003883 *p++ = chr;
3884#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003885 chr -= 0x10000L;
3886 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003887 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003888#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003889 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 endinpos = s-starts;
3891 outpos = p-PyUnicode_AS_UNICODE(v);
3892 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003893 errors, &errorHandler,
3894 "unicodeescape", "illegal Unicode character",
3895 &starts, &end, &startinpos, &endinpos, &exc, &s,
3896 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003897 goto onError;
3898 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003899 break;
3900
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003902 case 'N':
3903 message = "malformed \\N character escape";
3904 if (ucnhash_CAPI == NULL) {
3905 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003906 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003907 if (ucnhash_CAPI == NULL)
3908 goto ucnhashError;
3909 }
3910 if (*s == '{') {
3911 const char *start = s+1;
3912 /* look for the closing brace */
3913 while (*s != '}' && s < end)
3914 s++;
3915 if (s > start && s < end && *s == '}') {
3916 /* found a name. look it up in the unicode database */
3917 message = "unknown Unicode character name";
3918 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003919 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003920 goto store;
3921 }
3922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923 endinpos = s-starts;
3924 outpos = p-PyUnicode_AS_UNICODE(v);
3925 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003926 errors, &errorHandler,
3927 "unicodeescape", message,
3928 &starts, &end, &startinpos, &endinpos, &exc, &s,
3929 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003930 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003931 break;
3932
3933 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003934 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 message = "\\ at end of string";
3936 s--;
3937 endinpos = s-starts;
3938 outpos = p-PyUnicode_AS_UNICODE(v);
3939 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003940 errors, &errorHandler,
3941 "unicodeescape", message,
3942 &starts, &end, &startinpos, &endinpos, &exc, &s,
3943 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003944 goto onError;
3945 }
3946 else {
3947 *p++ = '\\';
3948 *p++ = (unsigned char)s[-1];
3949 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003950 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003952 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003955 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003957 Py_XDECREF(errorHandler);
3958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003960
Benjamin Peterson29060642009-01-31 22:14:21 +00003961 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003962 PyErr_SetString(
3963 PyExc_UnicodeError,
3964 "\\N escapes not supported (can't load unicodedata module)"
3965 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003966 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 Py_XDECREF(errorHandler);
3968 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003969 return NULL;
3970
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 Py_XDECREF(errorHandler);
3974 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 return NULL;
3976}
3977
/* Return a Unicode-Escape string version of the Unicode object.

   The result holds only the escaped text; no surrounding quotes are
   added.

*/
3984
Thomas Wouters477c8d52006-05-27 19:21:47 +00003985Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003986 Py_ssize_t size,
3987 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003988{
3989 /* like wcschr, but doesn't stop at NULL characters */
3990
3991 while (size-- > 0) {
3992 if (*s == ch)
3993 return s;
3994 s++;
3995 }
3996
3997 return NULL;
3998}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003999
Walter Dörwald79e913e2007-05-12 11:08:06 +00004000static const char *hexdigits = "0123456789abcdef";
4001
4002PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004005 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004008#ifdef Py_UNICODE_WIDE
4009 const Py_ssize_t expandsize = 10;
4010#else
4011 const Py_ssize_t expandsize = 6;
4012#endif
4013
Thomas Wouters89f507f2006-12-13 04:49:30 +00004014 /* XXX(nnorwitz): rather than over-allocating, it would be
4015 better to choose a different scheme. Perhaps scan the
4016 first N-chars of the string and allocate based on that size.
4017 */
4018 /* Initial allocation is based on the longest-possible unichr
4019 escape.
4020
4021 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4022 unichr, so in this case it's the longest unichr escape. In
4023 narrow (UTF-16) builds this is five chars per source unichr
4024 since there are two unichrs in the surrogate pair, so in narrow
4025 (UTF-16) builds it's not the longest unichr escape.
4026
4027 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4028 so in the narrow (UTF-16) build case it's the longest unichr
4029 escape.
4030 */
4031
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004032 if (size == 0)
4033 return PyBytes_FromStringAndSize(NULL, 0);
4034
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004035 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004036 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004037
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004038 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 2
4040 + expandsize*size
4041 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 if (repr == NULL)
4043 return NULL;
4044
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004045 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047 while (size-- > 0) {
4048 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004049
Walter Dörwald79e913e2007-05-12 11:08:06 +00004050 /* Escape backslashes */
4051 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 *p++ = '\\';
4053 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004054 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004055 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004056
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004057#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004058 /* Map 21-bit characters to '\U00xxxxxx' */
4059 else if (ch >= 0x10000) {
4060 *p++ = '\\';
4061 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004062 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4063 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4064 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4065 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4066 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4067 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4068 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4069 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004071 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004072#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004073 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4074 else if (ch >= 0xD800 && ch < 0xDC00) {
4075 Py_UNICODE ch2;
4076 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004077
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 ch2 = *s++;
4079 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004080 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4082 *p++ = '\\';
4083 *p++ = 'U';
4084 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4085 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4086 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4087 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4088 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4089 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4090 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4091 *p++ = hexdigits[ucs & 0x0000000F];
4092 continue;
4093 }
4094 /* Fall through: isolated surrogates are copied as-is */
4095 s--;
4096 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004097 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004098#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004099
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004101 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 *p++ = '\\';
4103 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004104 *p++ = hexdigits[(ch >> 12) & 0x000F];
4105 *p++ = hexdigits[(ch >> 8) & 0x000F];
4106 *p++ = hexdigits[(ch >> 4) & 0x000F];
4107 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004109
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004110 /* Map special whitespace to '\t', \n', '\r' */
4111 else if (ch == '\t') {
4112 *p++ = '\\';
4113 *p++ = 't';
4114 }
4115 else if (ch == '\n') {
4116 *p++ = '\\';
4117 *p++ = 'n';
4118 }
4119 else if (ch == '\r') {
4120 *p++ = '\\';
4121 *p++ = 'r';
4122 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004123
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004124 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004125 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004127 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004128 *p++ = hexdigits[(ch >> 4) & 0x000F];
4129 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004130 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004131
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 /* Copy everything else as-is */
4133 else
4134 *p++ = (char) ch;
4135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004137 assert(p - PyBytes_AS_STRING(repr) > 0);
4138 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4139 return NULL;
4140 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141}
4142
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004143PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004145 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 if (!PyUnicode_Check(unicode)) {
4147 PyErr_BadArgument();
4148 return NULL;
4149 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004150 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4151 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004152 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153}
4154
4155/* --- Raw Unicode Escape Codec ------------------------------------------- */
4156
4157PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 Py_ssize_t size,
4159 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004162 Py_ssize_t startinpos;
4163 Py_ssize_t endinpos;
4164 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 const char *end;
4168 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004169 PyObject *errorHandler = NULL;
4170 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004171
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 /* Escaped strings will always be longer than the resulting
4173 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 length after conversion to the true value. (But decoding error
4175 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 v = _PyUnicode_New(size);
4177 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182 end = s + size;
4183 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004184 unsigned char c;
4185 Py_UCS4 x;
4186 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004187 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 /* Non-escape characters are interpreted as Unicode ordinals */
4190 if (*s != '\\') {
4191 *p++ = (unsigned char)*s++;
4192 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004193 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 startinpos = s-starts;
4195
4196 /* \u-escapes are only interpreted iff the number of leading
4197 backslashes if odd */
4198 bs = s;
4199 for (;s < end;) {
4200 if (*s != '\\')
4201 break;
4202 *p++ = (unsigned char)*s++;
4203 }
4204 if (((s - bs) & 1) == 0 ||
4205 s >= end ||
4206 (*s != 'u' && *s != 'U')) {
4207 continue;
4208 }
4209 p--;
4210 count = *s=='u' ? 4 : 8;
4211 s++;
4212
4213 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4214 outpos = p-PyUnicode_AS_UNICODE(v);
4215 for (x = 0, i = 0; i < count; ++i, ++s) {
4216 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004217 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004218 endinpos = s-starts;
4219 if (unicode_decode_call_errorhandler(
4220 errors, &errorHandler,
4221 "rawunicodeescape", "truncated \\uXXXX",
4222 &starts, &end, &startinpos, &endinpos, &exc, &s,
4223 &v, &outpos, &p))
4224 goto onError;
4225 goto nextByte;
4226 }
4227 x = (x<<4) & ~0xF;
4228 if (c >= '0' && c <= '9')
4229 x += c - '0';
4230 else if (c >= 'a' && c <= 'f')
4231 x += 10 + c - 'a';
4232 else
4233 x += 10 + c - 'A';
4234 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004235 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 /* UCS-2 character */
4237 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004238 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004239 /* UCS-4 character. Either store directly, or as
4240 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004241#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004242 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004243#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 x -= 0x10000L;
4245 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4246 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004247#endif
4248 } else {
4249 endinpos = s-starts;
4250 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004251 if (unicode_decode_call_errorhandler(
4252 errors, &errorHandler,
4253 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 &starts, &end, &startinpos, &endinpos, &exc, &s,
4255 &v, &outpos, &p))
4256 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004257 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 nextByte:
4259 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004261 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 Py_XDECREF(errorHandler);
4264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004266
Benjamin Peterson29060642009-01-31 22:14:21 +00004267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 Py_XDECREF(errorHandler);
4270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 return NULL;
4272}
4273
4274PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004275 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004277 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 char *p;
4279 char *q;
4280
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004281#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004282 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004283#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004284 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004285#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004286
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004287 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004289
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004290 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 if (repr == NULL)
4292 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004293 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004294 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004296 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 while (size-- > 0) {
4298 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004299#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 /* Map 32-bit characters to '\Uxxxxxxxx' */
4301 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004302 *p++ = '\\';
4303 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004304 *p++ = hexdigits[(ch >> 28) & 0xf];
4305 *p++ = hexdigits[(ch >> 24) & 0xf];
4306 *p++ = hexdigits[(ch >> 20) & 0xf];
4307 *p++ = hexdigits[(ch >> 16) & 0xf];
4308 *p++ = hexdigits[(ch >> 12) & 0xf];
4309 *p++ = hexdigits[(ch >> 8) & 0xf];
4310 *p++ = hexdigits[(ch >> 4) & 0xf];
4311 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004312 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004313 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004314#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4316 if (ch >= 0xD800 && ch < 0xDC00) {
4317 Py_UNICODE ch2;
4318 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004319
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 ch2 = *s++;
4321 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004322 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4324 *p++ = '\\';
4325 *p++ = 'U';
4326 *p++ = hexdigits[(ucs >> 28) & 0xf];
4327 *p++ = hexdigits[(ucs >> 24) & 0xf];
4328 *p++ = hexdigits[(ucs >> 20) & 0xf];
4329 *p++ = hexdigits[(ucs >> 16) & 0xf];
4330 *p++ = hexdigits[(ucs >> 12) & 0xf];
4331 *p++ = hexdigits[(ucs >> 8) & 0xf];
4332 *p++ = hexdigits[(ucs >> 4) & 0xf];
4333 *p++ = hexdigits[ucs & 0xf];
4334 continue;
4335 }
4336 /* Fall through: isolated surrogates are copied as-is */
4337 s--;
4338 size++;
4339 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004340#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004341 /* Map 16-bit characters to '\uxxxx' */
4342 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 *p++ = '\\';
4344 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004345 *p++ = hexdigits[(ch >> 12) & 0xf];
4346 *p++ = hexdigits[(ch >> 8) & 0xf];
4347 *p++ = hexdigits[(ch >> 4) & 0xf];
4348 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004350 /* Copy everything else as-is */
4351 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352 *p++ = (char) ch;
4353 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004354 size = p - q;
4355
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004356 assert(size > 0);
4357 if (_PyBytes_Resize(&repr, size) < 0)
4358 return NULL;
4359 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360}
4361
4362PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4363{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004364 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004366 PyErr_BadArgument();
4367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004369 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4370 PyUnicode_GET_SIZE(unicode));
4371
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004372 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373}
4374
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004375/* --- Unicode Internal Codec ------------------------------------------- */
4376
4377PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 Py_ssize_t size,
4379 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004380{
4381 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004382 Py_ssize_t startinpos;
4383 Py_ssize_t endinpos;
4384 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004385 PyUnicodeObject *v;
4386 Py_UNICODE *p;
4387 const char *end;
4388 const char *reason;
4389 PyObject *errorHandler = NULL;
4390 PyObject *exc = NULL;
4391
Neal Norwitzd43069c2006-01-08 01:12:10 +00004392#ifdef Py_UNICODE_WIDE
4393 Py_UNICODE unimax = PyUnicode_GetMax();
4394#endif
4395
Thomas Wouters89f507f2006-12-13 04:49:30 +00004396 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004397 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4398 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004400 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004402 p = PyUnicode_AS_UNICODE(v);
4403 end = s + size;
4404
4405 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004406 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004407 /* We have to sanity check the raw data, otherwise doom looms for
4408 some malformed UCS-4 data. */
4409 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004410#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004411 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004412#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004413 end-s < Py_UNICODE_SIZE
4414 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004415 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004416 startinpos = s - starts;
4417 if (end-s < Py_UNICODE_SIZE) {
4418 endinpos = end-starts;
4419 reason = "truncated input";
4420 }
4421 else {
4422 endinpos = s - starts + Py_UNICODE_SIZE;
4423 reason = "illegal code point (> 0x10FFFF)";
4424 }
4425 outpos = p - PyUnicode_AS_UNICODE(v);
4426 if (unicode_decode_call_errorhandler(
4427 errors, &errorHandler,
4428 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004429 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004430 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004431 goto onError;
4432 }
4433 }
4434 else {
4435 p++;
4436 s += Py_UNICODE_SIZE;
4437 }
4438 }
4439
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004440 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004441 goto onError;
4442 Py_XDECREF(errorHandler);
4443 Py_XDECREF(exc);
4444 return (PyObject *)v;
4445
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004447 Py_XDECREF(v);
4448 Py_XDECREF(errorHandler);
4449 Py_XDECREF(exc);
4450 return NULL;
4451}
4452
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453/* --- Latin-1 Codec ------------------------------------------------------ */
4454
4455PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 Py_ssize_t size,
4457 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458{
4459 PyUnicodeObject *v;
4460 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004461 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004462
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004464 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 Py_UNICODE r = *(unsigned char*)s;
4466 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004467 }
4468
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 v = _PyUnicode_New(size);
4470 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004475 e = s + size;
4476 /* Unrolling the copy makes it much faster by reducing the looping
4477 overhead. This is similar to what many memcpy() implementations do. */
4478 unrolled_end = e - 4;
4479 while (s < unrolled_end) {
4480 p[0] = (unsigned char) s[0];
4481 p[1] = (unsigned char) s[1];
4482 p[2] = (unsigned char) s[2];
4483 p[3] = (unsigned char) s[3];
4484 s += 4;
4485 p += 4;
4486 }
4487 while (s < e)
4488 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004490
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 Py_XDECREF(v);
4493 return NULL;
4494}
4495
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496/* create or adjust a UnicodeEncodeError */
4497static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 const char *encoding,
4499 const Py_UNICODE *unicode, Py_ssize_t size,
4500 Py_ssize_t startpos, Py_ssize_t endpos,
4501 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 *exceptionObject = PyUnicodeEncodeError_Create(
4505 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 }
4507 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4509 goto onError;
4510 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4511 goto onError;
4512 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4513 goto onError;
4514 return;
4515 onError:
4516 Py_DECREF(*exceptionObject);
4517 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 }
4519}
4520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521/* raises a UnicodeEncodeError */
4522static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 const char *encoding,
4524 const Py_UNICODE *unicode, Py_ssize_t size,
4525 Py_ssize_t startpos, Py_ssize_t endpos,
4526 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527{
4528 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532}
4533
4534/* error handling callback helper:
4535 build arguments, call the callback and check the arguments,
4536 put the result into newpos and return the replacement string, which
4537 has to be freed by the caller */
4538static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 PyObject **errorHandler,
4540 const char *encoding, const char *reason,
4541 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4542 Py_ssize_t startpos, Py_ssize_t endpos,
4543 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004545 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546
4547 PyObject *restuple;
4548 PyObject *resunicode;
4549
4550 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 }
4555
4556 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004557 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004559 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560
4561 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004566 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 Py_DECREF(restuple);
4568 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004570 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004571 &resunicode, newpos)) {
4572 Py_DECREF(restuple);
4573 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004575 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4576 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4577 Py_DECREF(restuple);
4578 return NULL;
4579 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004582 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4584 Py_DECREF(restuple);
4585 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004586 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 Py_INCREF(resunicode);
4588 Py_DECREF(restuple);
4589 return resunicode;
4590}
4591
4592static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 Py_ssize_t size,
4594 const char *errors,
4595 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596{
4597 /* output object */
4598 PyObject *res;
4599 /* pointers to the beginning and end+1 of input */
4600 const Py_UNICODE *startp = p;
4601 const Py_UNICODE *endp = p + size;
4602 /* pointer to the beginning of the unencodable characters */
4603 /* const Py_UNICODE *badp = NULL; */
4604 /* pointer into the output */
4605 char *str;
4606 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004607 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004608 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4609 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 PyObject *errorHandler = NULL;
4611 PyObject *exc = NULL;
4612 /* the following variable is used for caching string comparisons
4613 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4614 int known_errorHandler = -1;
4615
4616 /* allocate enough for a simple encoding without
4617 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004618 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004619 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004620 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004622 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004623 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624 ressize = size;
4625
4626 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 /* can we encode this? */
4630 if (c<limit) {
4631 /* no overflow check, because we know that the space is enough */
4632 *str++ = (char)c;
4633 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004634 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 else {
4636 Py_ssize_t unicodepos = p-startp;
4637 Py_ssize_t requiredsize;
4638 PyObject *repunicode;
4639 Py_ssize_t repsize;
4640 Py_ssize_t newpos;
4641 Py_ssize_t respos;
4642 Py_UNICODE *uni2;
4643 /* startpos for collecting unencodable chars */
4644 const Py_UNICODE *collstart = p;
4645 const Py_UNICODE *collend = p;
4646 /* find all unecodable characters */
4647 while ((collend < endp) && ((*collend)>=limit))
4648 ++collend;
4649 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4650 if (known_errorHandler==-1) {
4651 if ((errors==NULL) || (!strcmp(errors, "strict")))
4652 known_errorHandler = 1;
4653 else if (!strcmp(errors, "replace"))
4654 known_errorHandler = 2;
4655 else if (!strcmp(errors, "ignore"))
4656 known_errorHandler = 3;
4657 else if (!strcmp(errors, "xmlcharrefreplace"))
4658 known_errorHandler = 4;
4659 else
4660 known_errorHandler = 0;
4661 }
4662 switch (known_errorHandler) {
4663 case 1: /* strict */
4664 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4665 goto onError;
4666 case 2: /* replace */
4667 while (collstart++<collend)
4668 *str++ = '?'; /* fall through */
4669 case 3: /* ignore */
4670 p = collend;
4671 break;
4672 case 4: /* xmlcharrefreplace */
4673 respos = str - PyBytes_AS_STRING(res);
4674 /* determine replacement size (temporarily (mis)uses p) */
4675 for (p = collstart, repsize = 0; p < collend; ++p) {
4676 if (*p<10)
4677 repsize += 2+1+1;
4678 else if (*p<100)
4679 repsize += 2+2+1;
4680 else if (*p<1000)
4681 repsize += 2+3+1;
4682 else if (*p<10000)
4683 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004684#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 else
4686 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004687#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 else if (*p<100000)
4689 repsize += 2+5+1;
4690 else if (*p<1000000)
4691 repsize += 2+6+1;
4692 else
4693 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004694#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 }
4696 requiredsize = respos+repsize+(endp-collend);
4697 if (requiredsize > ressize) {
4698 if (requiredsize<2*ressize)
4699 requiredsize = 2*ressize;
4700 if (_PyBytes_Resize(&res, requiredsize))
4701 goto onError;
4702 str = PyBytes_AS_STRING(res) + respos;
4703 ressize = requiredsize;
4704 }
4705 /* generate replacement (temporarily (mis)uses p) */
4706 for (p = collstart; p < collend; ++p) {
4707 str += sprintf(str, "&#%d;", (int)*p);
4708 }
4709 p = collend;
4710 break;
4711 default:
4712 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4713 encoding, reason, startp, size, &exc,
4714 collstart-startp, collend-startp, &newpos);
4715 if (repunicode == NULL)
4716 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004717 if (PyBytes_Check(repunicode)) {
4718 /* Directly copy bytes result to output. */
4719 repsize = PyBytes_Size(repunicode);
4720 if (repsize > 1) {
4721 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004722 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004723 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4724 Py_DECREF(repunicode);
4725 goto onError;
4726 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004727 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004728 ressize += repsize-1;
4729 }
4730 memcpy(str, PyBytes_AsString(repunicode), repsize);
4731 str += repsize;
4732 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004733 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004734 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004735 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004736 /* need more space? (at least enough for what we
4737 have+the replacement+the rest of the string, so
4738 we won't have to check space for encodable characters) */
4739 respos = str - PyBytes_AS_STRING(res);
4740 repsize = PyUnicode_GET_SIZE(repunicode);
4741 requiredsize = respos+repsize+(endp-collend);
4742 if (requiredsize > ressize) {
4743 if (requiredsize<2*ressize)
4744 requiredsize = 2*ressize;
4745 if (_PyBytes_Resize(&res, requiredsize)) {
4746 Py_DECREF(repunicode);
4747 goto onError;
4748 }
4749 str = PyBytes_AS_STRING(res) + respos;
4750 ressize = requiredsize;
4751 }
4752 /* check if there is anything unencodable in the replacement
4753 and copy it to the output */
4754 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4755 c = *uni2;
4756 if (c >= limit) {
4757 raise_encode_exception(&exc, encoding, startp, size,
4758 unicodepos, unicodepos+1, reason);
4759 Py_DECREF(repunicode);
4760 goto onError;
4761 }
4762 *str = (char)c;
4763 }
4764 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004765 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004767 }
4768 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004769 /* Resize if we allocated to much */
4770 size = str - PyBytes_AS_STRING(res);
4771 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004772 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004773 if (_PyBytes_Resize(&res, size) < 0)
4774 goto onError;
4775 }
4776
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 Py_XDECREF(errorHandler);
4778 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004779 return res;
4780
4781 onError:
4782 Py_XDECREF(res);
4783 Py_XDECREF(errorHandler);
4784 Py_XDECREF(exc);
4785 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786}
4787
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 Py_ssize_t size,
4790 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
4795PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4796{
4797 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 PyErr_BadArgument();
4799 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 }
4801 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 PyUnicode_GET_SIZE(unicode),
4803 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804}
4805
4806/* --- 7-bit ASCII Codec -------------------------------------------------- */
4807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 Py_ssize_t size,
4810 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 PyUnicodeObject *v;
4814 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815 Py_ssize_t startinpos;
4816 Py_ssize_t endinpos;
4817 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 const char *e;
4819 PyObject *errorHandler = NULL;
4820 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004821
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004823 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004824 Py_UNICODE r = *(unsigned char*)s;
4825 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004826 }
Tim Petersced69f82003-09-16 20:30:58 +00004827
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828 v = _PyUnicode_New(size);
4829 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004830 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004832 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 e = s + size;
4835 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 register unsigned char c = (unsigned char)*s;
4837 if (c < 128) {
4838 *p++ = c;
4839 ++s;
4840 }
4841 else {
4842 startinpos = s-starts;
4843 endinpos = startinpos + 1;
4844 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4845 if (unicode_decode_call_errorhandler(
4846 errors, &errorHandler,
4847 "ascii", "ordinal not in range(128)",
4848 &starts, &e, &startinpos, &endinpos, &exc, &s,
4849 &v, &outpos, &p))
4850 goto onError;
4851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004853 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4855 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 Py_XDECREF(errorHandler);
4857 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004859
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004862 Py_XDECREF(errorHandler);
4863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 return NULL;
4865}
4866
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 Py_ssize_t size,
4869 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872}
4873
4874PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4875{
4876 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 PyErr_BadArgument();
4878 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 }
4880 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 PyUnicode_GET_SIZE(unicode),
4882 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883}
4884
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004885#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004886
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004887/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004888
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004889#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004890#define NEED_RETRY
4891#endif
4892
4893/* XXX This code is limited to "true" double-byte encodings, as
4894 a) it assumes an incomplete character consists of a single byte, and
4895 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004897
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   The byte must pass IsDBCSLeadByte().  It is then accepted when
   CharPrev() cannot step back from it, when the character found before it
   does not start with a lead byte, or when that previous character is
   exactly two bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4908
4909/*
4910 * Decode MBCS string into unicode object. If 'final' is set, converts
4911 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4912 */
4913static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 const char *s, /* MBCS string */
4915 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004916 int final,
4917 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004918{
4919 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004920 Py_ssize_t n;
4921 DWORD usize;
4922 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004923
4924 assert(size >= 0);
4925
Victor Stinner554f3f02010-06-16 23:33:54 +00004926 /* check and handle 'errors' arg */
4927 if (errors==NULL || strcmp(errors, "strict")==0)
4928 flags = MB_ERR_INVALID_CHARS;
4929 else if (strcmp(errors, "ignore")==0)
4930 flags = 0;
4931 else {
4932 PyErr_Format(PyExc_ValueError,
4933 "mbcs encoding does not support errors='%s'",
4934 errors);
4935 return -1;
4936 }
4937
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004938 /* Skip trailing lead-byte unless 'final' is set */
4939 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004941
4942 /* First get the size of the result */
4943 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004944 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4945 if (usize==0)
4946 goto mbcs_decode_error;
4947 } else
4948 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004949
4950 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 /* Create unicode object */
4952 *v = _PyUnicode_New(usize);
4953 if (*v == NULL)
4954 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004955 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004956 }
4957 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 /* Extend unicode object */
4959 n = PyUnicode_GET_SIZE(*v);
4960 if (_PyUnicode_Resize(v, n + usize) < 0)
4961 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004962 }
4963
4964 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004965 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004967 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4968 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004970 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004971 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004972
4973mbcs_decode_error:
4974 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4975 we raise a UnicodeDecodeError - else it is a 'generic'
4976 windows error
4977 */
4978 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4979 /* Ideally, we should get reason from FormatMessage - this
4980 is the Windows 2000 English version of the message
4981 */
4982 PyObject *exc = NULL;
4983 const char *reason = "No mapping for the Unicode character exists "
4984 "in the target multi-byte code page.";
4985 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4986 if (exc != NULL) {
4987 PyCodec_StrictErrors(exc);
4988 Py_DECREF(exc);
4989 }
4990 } else {
4991 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4992 }
4993 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004994}
4995
4996PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 Py_ssize_t size,
4998 const char *errors,
4999 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000{
5001 PyUnicodeObject *v = NULL;
5002 int done;
5003
5004 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005006
5007#ifdef NEED_RETRY
5008 retry:
5009 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005010 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005011 else
5012#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005013 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005014
5015 if (done < 0) {
5016 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018 }
5019
5020 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005022
5023#ifdef NEED_RETRY
5024 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 s += done;
5026 size -= done;
5027 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005028 }
5029#endif
5030
5031 return (PyObject *)v;
5032}
5033
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005034PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 Py_ssize_t size,
5036 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005037{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005038 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5039}
5040
5041/*
5042 * Convert unicode into string object (MBCS).
5043 * Returns 0 if succeed, -1 otherwise.
5044 */
5045static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005047 int size, /* size of unicode */
5048 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005049{
Victor Stinner554f3f02010-06-16 23:33:54 +00005050 BOOL usedDefaultChar = FALSE;
5051 BOOL *pusedDefaultChar;
5052 int mbcssize;
5053 Py_ssize_t n;
5054 PyObject *exc = NULL;
5055 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005056
5057 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005058
Victor Stinner554f3f02010-06-16 23:33:54 +00005059 /* check and handle 'errors' arg */
5060 if (errors==NULL || strcmp(errors, "strict")==0) {
5061 flags = WC_NO_BEST_FIT_CHARS;
5062 pusedDefaultChar = &usedDefaultChar;
5063 } else if (strcmp(errors, "replace")==0) {
5064 flags = 0;
5065 pusedDefaultChar = NULL;
5066 } else {
5067 PyErr_Format(PyExc_ValueError,
5068 "mbcs encoding does not support errors='%s'",
5069 errors);
5070 return -1;
5071 }
5072
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005073 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005074 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005075 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5076 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 if (mbcssize == 0) {
5078 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5079 return -1;
5080 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005081 /* If we used a default char, then we failed! */
5082 if (pusedDefaultChar && *pusedDefaultChar)
5083 goto mbcs_encode_error;
5084 } else {
5085 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005086 }
5087
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005088 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 /* Create string object */
5090 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5091 if (*repr == NULL)
5092 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005093 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005094 }
5095 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 /* Extend string object */
5097 n = PyBytes_Size(*repr);
5098 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5099 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005100 }
5101
5102 /* Do the conversion */
5103 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005105 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5106 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5108 return -1;
5109 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005110 if (pusedDefaultChar && *pusedDefaultChar)
5111 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005112 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005113 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005114
5115mbcs_encode_error:
5116 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5117 Py_XDECREF(exc);
5118 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005119}
5120
5121PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 Py_ssize_t size,
5123 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005124{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005125 PyObject *repr = NULL;
5126 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005127
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005128#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005130 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005131 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005132 else
5133#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005134 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005135
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005136 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 Py_XDECREF(repr);
5138 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005139 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005140
5141#ifdef NEED_RETRY
5142 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 p += INT_MAX;
5144 size -= INT_MAX;
5145 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005146 }
5147#endif
5148
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005149 return repr;
5150}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005151
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005152PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5153{
5154 if (!PyUnicode_Check(unicode)) {
5155 PyErr_BadArgument();
5156 return NULL;
5157 }
5158 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 PyUnicode_GET_SIZE(unicode),
5160 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005161}
5162
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005163#undef NEED_RETRY
5164
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005165#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005166
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167/* --- Character Mapping Codec -------------------------------------------- */
5168
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 Py_ssize_t size,
5171 PyObject *mapping,
5172 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005175 Py_ssize_t startinpos;
5176 Py_ssize_t endinpos;
5177 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005178 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 PyUnicodeObject *v;
5180 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005181 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182 PyObject *errorHandler = NULL;
5183 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005184 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005185 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005186
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 /* Default to Latin-1 */
5188 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190
5191 v = _PyUnicode_New(size);
5192 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005197 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005198 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 mapstring = PyUnicode_AS_UNICODE(mapping);
5200 maplen = PyUnicode_GET_SIZE(mapping);
5201 while (s < e) {
5202 unsigned char ch = *s;
5203 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 if (ch < maplen)
5206 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 if (x == 0xfffe) {
5209 /* undefined mapping */
5210 outpos = p-PyUnicode_AS_UNICODE(v);
5211 startinpos = s-starts;
5212 endinpos = startinpos+1;
5213 if (unicode_decode_call_errorhandler(
5214 errors, &errorHandler,
5215 "charmap", "character maps to <undefined>",
5216 &starts, &e, &startinpos, &endinpos, &exc, &s,
5217 &v, &outpos, &p)) {
5218 goto onError;
5219 }
5220 continue;
5221 }
5222 *p++ = x;
5223 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005224 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005225 }
5226 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 while (s < e) {
5228 unsigned char ch = *s;
5229 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005230
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5232 w = PyLong_FromLong((long)ch);
5233 if (w == NULL)
5234 goto onError;
5235 x = PyObject_GetItem(mapping, w);
5236 Py_DECREF(w);
5237 if (x == NULL) {
5238 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5239 /* No mapping found means: mapping is undefined. */
5240 PyErr_Clear();
5241 x = Py_None;
5242 Py_INCREF(x);
5243 } else
5244 goto onError;
5245 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005246
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 /* Apply mapping */
5248 if (PyLong_Check(x)) {
5249 long value = PyLong_AS_LONG(x);
5250 if (value < 0 || value > 65535) {
5251 PyErr_SetString(PyExc_TypeError,
5252 "character mapping must be in range(65536)");
5253 Py_DECREF(x);
5254 goto onError;
5255 }
5256 *p++ = (Py_UNICODE)value;
5257 }
5258 else if (x == Py_None) {
5259 /* undefined mapping */
5260 outpos = p-PyUnicode_AS_UNICODE(v);
5261 startinpos = s-starts;
5262 endinpos = startinpos+1;
5263 if (unicode_decode_call_errorhandler(
5264 errors, &errorHandler,
5265 "charmap", "character maps to <undefined>",
5266 &starts, &e, &startinpos, &endinpos, &exc, &s,
5267 &v, &outpos, &p)) {
5268 Py_DECREF(x);
5269 goto onError;
5270 }
5271 Py_DECREF(x);
5272 continue;
5273 }
5274 else if (PyUnicode_Check(x)) {
5275 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005276
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 if (targetsize == 1)
5278 /* 1-1 mapping */
5279 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005280
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 else if (targetsize > 1) {
5282 /* 1-n mapping */
5283 if (targetsize > extrachars) {
5284 /* resize first */
5285 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5286 Py_ssize_t needed = (targetsize - extrachars) + \
5287 (targetsize << 2);
5288 extrachars += needed;
5289 /* XXX overflow detection missing */
5290 if (_PyUnicode_Resize(&v,
5291 PyUnicode_GET_SIZE(v) + needed) < 0) {
5292 Py_DECREF(x);
5293 goto onError;
5294 }
5295 p = PyUnicode_AS_UNICODE(v) + oldpos;
5296 }
5297 Py_UNICODE_COPY(p,
5298 PyUnicode_AS_UNICODE(x),
5299 targetsize);
5300 p += targetsize;
5301 extrachars -= targetsize;
5302 }
5303 /* 1-0 mapping: skip the character */
5304 }
5305 else {
5306 /* wrong return value */
5307 PyErr_SetString(PyExc_TypeError,
5308 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005309 Py_DECREF(x);
5310 goto onError;
5311 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 Py_DECREF(x);
5313 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 }
5316 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5318 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005319 Py_XDECREF(errorHandler);
5320 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005322
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324 Py_XDECREF(errorHandler);
5325 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 Py_XDECREF(v);
5327 return NULL;
5328}
5329
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005330/* Charmap encoding: the lookup table */
5331
5332struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 PyObject_HEAD
5334 unsigned char level1[32];
5335 int count2, count3;
5336 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005337};
5338
5339static PyObject*
5340encoding_map_size(PyObject *obj, PyObject* args)
5341{
5342 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005343 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005345}
5346
5347static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005348 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 PyDoc_STR("Return the size (in bytes) of this object") },
5350 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005351};
5352
5353static void
5354encoding_map_dealloc(PyObject* o)
5355{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005356 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005357}
5358
5359static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005360 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 "EncodingMap", /*tp_name*/
5362 sizeof(struct encoding_map), /*tp_basicsize*/
5363 0, /*tp_itemsize*/
5364 /* methods */
5365 encoding_map_dealloc, /*tp_dealloc*/
5366 0, /*tp_print*/
5367 0, /*tp_getattr*/
5368 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005369 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 0, /*tp_repr*/
5371 0, /*tp_as_number*/
5372 0, /*tp_as_sequence*/
5373 0, /*tp_as_mapping*/
5374 0, /*tp_hash*/
5375 0, /*tp_call*/
5376 0, /*tp_str*/
5377 0, /*tp_getattro*/
5378 0, /*tp_setattro*/
5379 0, /*tp_as_buffer*/
5380 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5381 0, /*tp_doc*/
5382 0, /*tp_traverse*/
5383 0, /*tp_clear*/
5384 0, /*tp_richcompare*/
5385 0, /*tp_weaklistoffset*/
5386 0, /*tp_iter*/
5387 0, /*tp_iternext*/
5388 encoding_map_methods, /*tp_methods*/
5389 0, /*tp_members*/
5390 0, /*tp_getset*/
5391 0, /*tp_base*/
5392 0, /*tp_dict*/
5393 0, /*tp_descr_get*/
5394 0, /*tp_descr_set*/
5395 0, /*tp_dictoffset*/
5396 0, /*tp_init*/
5397 0, /*tp_alloc*/
5398 0, /*tp_new*/
5399 0, /*tp_free*/
5400 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005401};
5402
5403PyObject*
5404PyUnicode_BuildEncodingMap(PyObject* string)
5405{
5406 Py_UNICODE *decode;
5407 PyObject *result;
5408 struct encoding_map *mresult;
5409 int i;
5410 int need_dict = 0;
5411 unsigned char level1[32];
5412 unsigned char level2[512];
5413 unsigned char *mlevel1, *mlevel2, *mlevel3;
5414 int count2 = 0, count3 = 0;
5415
5416 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5417 PyErr_BadArgument();
5418 return NULL;
5419 }
5420 decode = PyUnicode_AS_UNICODE(string);
5421 memset(level1, 0xFF, sizeof level1);
5422 memset(level2, 0xFF, sizeof level2);
5423
5424 /* If there isn't a one-to-one mapping of NULL to \0,
5425 or if there are non-BMP characters, we need to use
5426 a mapping dictionary. */
5427 if (decode[0] != 0)
5428 need_dict = 1;
5429 for (i = 1; i < 256; i++) {
5430 int l1, l2;
5431 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005432#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005433 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005434#endif
5435 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005436 need_dict = 1;
5437 break;
5438 }
5439 if (decode[i] == 0xFFFE)
5440 /* unmapped character */
5441 continue;
5442 l1 = decode[i] >> 11;
5443 l2 = decode[i] >> 7;
5444 if (level1[l1] == 0xFF)
5445 level1[l1] = count2++;
5446 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005447 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005448 }
5449
5450 if (count2 >= 0xFF || count3 >= 0xFF)
5451 need_dict = 1;
5452
5453 if (need_dict) {
5454 PyObject *result = PyDict_New();
5455 PyObject *key, *value;
5456 if (!result)
5457 return NULL;
5458 for (i = 0; i < 256; i++) {
5459 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005460 key = PyLong_FromLong(decode[i]);
5461 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005462 if (!key || !value)
5463 goto failed1;
5464 if (PyDict_SetItem(result, key, value) == -1)
5465 goto failed1;
5466 Py_DECREF(key);
5467 Py_DECREF(value);
5468 }
5469 return result;
5470 failed1:
5471 Py_XDECREF(key);
5472 Py_XDECREF(value);
5473 Py_DECREF(result);
5474 return NULL;
5475 }
5476
5477 /* Create a three-level trie */
5478 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5479 16*count2 + 128*count3 - 1);
5480 if (!result)
5481 return PyErr_NoMemory();
5482 PyObject_Init(result, &EncodingMapType);
5483 mresult = (struct encoding_map*)result;
5484 mresult->count2 = count2;
5485 mresult->count3 = count3;
5486 mlevel1 = mresult->level1;
5487 mlevel2 = mresult->level23;
5488 mlevel3 = mresult->level23 + 16*count2;
5489 memcpy(mlevel1, level1, 32);
5490 memset(mlevel2, 0xFF, 16*count2);
5491 memset(mlevel3, 0, 128*count3);
5492 count3 = 0;
5493 for (i = 1; i < 256; i++) {
5494 int o1, o2, o3, i2, i3;
5495 if (decode[i] == 0xFFFE)
5496 /* unmapped character */
5497 continue;
5498 o1 = decode[i]>>11;
5499 o2 = (decode[i]>>7) & 0xF;
5500 i2 = 16*mlevel1[o1] + o2;
5501 if (mlevel2[i2] == 0xFF)
5502 mlevel2[i2] = count3++;
5503 o3 = decode[i] & 0x7F;
5504 i3 = 128*mlevel2[i2] + o3;
5505 mlevel3[i3] = i;
5506 }
5507 return result;
5508}
5509
5510static int
5511encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5512{
5513 struct encoding_map *map = (struct encoding_map*)mapping;
5514 int l1 = c>>11;
5515 int l2 = (c>>7) & 0xF;
5516 int l3 = c & 0x7F;
5517 int i;
5518
5519#ifdef Py_UNICODE_WIDE
5520 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005522 }
5523#endif
5524 if (c == 0)
5525 return 0;
5526 /* level 1*/
5527 i = map->level1[l1];
5528 if (i == 0xFF) {
5529 return -1;
5530 }
5531 /* level 2*/
5532 i = map->level23[16*i+l2];
5533 if (i == 0xFF) {
5534 return -1;
5535 }
5536 /* level 3 */
5537 i = map->level23[16*map->count2 + 128*i + l3];
5538 if (i == 0) {
5539 return -1;
5540 }
5541 return i;
5542}
5543
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544/* Lookup the character ch in the mapping. If the character
5545 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005546 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548{
Christian Heimes217cfd12007-12-02 14:31:20 +00005549 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 PyObject *x;
5551
5552 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 x = PyObject_GetItem(mapping, w);
5555 Py_DECREF(w);
5556 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5558 /* No mapping found means: mapping is undefined. */
5559 PyErr_Clear();
5560 x = Py_None;
5561 Py_INCREF(x);
5562 return x;
5563 } else
5564 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005566 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005568 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 long value = PyLong_AS_LONG(x);
5570 if (value < 0 || value > 255) {
5571 PyErr_SetString(PyExc_TypeError,
5572 "character mapping must be in range(256)");
5573 Py_DECREF(x);
5574 return NULL;
5575 }
5576 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005578 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 /* wrong return value */
5582 PyErr_Format(PyExc_TypeError,
5583 "character mapping must return integer, bytes or None, not %.400s",
5584 x->ob_type->tp_name);
5585 Py_DECREF(x);
5586 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 }
5588}
5589
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005590static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005591charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005592{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005593 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5594 /* exponentially overallocate to minimize reallocations */
5595 if (requiredsize < 2*outsize)
5596 requiredsize = 2*outsize;
5597 if (_PyBytes_Resize(outobj, requiredsize))
5598 return -1;
5599 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005600}
5601
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a Python exception has been set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005605/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005606 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 space is available. Return a new reference to the object that
5608 was put in the output buffer, or Py_None, if the mapping was undefined
5609 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005610 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005612charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005615 PyObject *rep;
5616 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005617 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618
Christian Heimes90aa7642007-12-19 02:45:37 +00005619 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005620 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005622 if (res == -1)
5623 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 if (outsize<requiredsize)
5625 if (charmapencode_resize(outobj, outpos, requiredsize))
5626 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005627 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 outstart[(*outpos)++] = (char)res;
5629 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005630 }
5631
5632 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005635 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 Py_DECREF(rep);
5637 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005638 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 if (PyLong_Check(rep)) {
5640 Py_ssize_t requiredsize = *outpos+1;
5641 if (outsize<requiredsize)
5642 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5643 Py_DECREF(rep);
5644 return enc_EXCEPTION;
5645 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005646 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005648 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 else {
5650 const char *repchars = PyBytes_AS_STRING(rep);
5651 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5652 Py_ssize_t requiredsize = *outpos+repsize;
5653 if (outsize<requiredsize)
5654 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5655 Py_DECREF(rep);
5656 return enc_EXCEPTION;
5657 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005658 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 memcpy(outstart + *outpos, repchars, repsize);
5660 *outpos += repsize;
5661 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005662 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005663 Py_DECREF(rep);
5664 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665}
5666
5667/* handle an error in PyUnicode_EncodeCharmap
5668 Return 0 on success, -1 on error */
5669static
5670int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005671 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005673 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005674 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675{
5676 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005677 Py_ssize_t repsize;
5678 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 Py_UNICODE *uni2;
5680 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005681 Py_ssize_t collstartpos = *inpos;
5682 Py_ssize_t collendpos = *inpos+1;
5683 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 char *encoding = "charmap";
5685 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005686 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688 /* find all unencodable characters */
5689 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005690 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005691 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 int res = encoding_map_lookup(p[collendpos], mapping);
5693 if (res != -1)
5694 break;
5695 ++collendpos;
5696 continue;
5697 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005698
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 rep = charmapencode_lookup(p[collendpos], mapping);
5700 if (rep==NULL)
5701 return -1;
5702 else if (rep!=Py_None) {
5703 Py_DECREF(rep);
5704 break;
5705 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005706 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 }
5709 /* cache callback name lookup
5710 * (if not done yet, i.e. it's the first error) */
5711 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 if ((errors==NULL) || (!strcmp(errors, "strict")))
5713 *known_errorHandler = 1;
5714 else if (!strcmp(errors, "replace"))
5715 *known_errorHandler = 2;
5716 else if (!strcmp(errors, "ignore"))
5717 *known_errorHandler = 3;
5718 else if (!strcmp(errors, "xmlcharrefreplace"))
5719 *known_errorHandler = 4;
5720 else
5721 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 }
5723 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005724 case 1: /* strict */
5725 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5726 return -1;
5727 case 2: /* replace */
5728 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 x = charmapencode_output('?', mapping, res, respos);
5730 if (x==enc_EXCEPTION) {
5731 return -1;
5732 }
5733 else if (x==enc_FAILED) {
5734 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5735 return -1;
5736 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005737 }
5738 /* fall through */
5739 case 3: /* ignore */
5740 *inpos = collendpos;
5741 break;
5742 case 4: /* xmlcharrefreplace */
5743 /* generate replacement (temporarily (mis)uses p) */
5744 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 char buffer[2+29+1+1];
5746 char *cp;
5747 sprintf(buffer, "&#%d;", (int)p[collpos]);
5748 for (cp = buffer; *cp; ++cp) {
5749 x = charmapencode_output(*cp, mapping, res, respos);
5750 if (x==enc_EXCEPTION)
5751 return -1;
5752 else if (x==enc_FAILED) {
5753 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5754 return -1;
5755 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005756 }
5757 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005758 *inpos = collendpos;
5759 break;
5760 default:
5761 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 encoding, reason, p, size, exceptionObject,
5763 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005764 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005766 if (PyBytes_Check(repunicode)) {
5767 /* Directly copy bytes result to output. */
5768 Py_ssize_t outsize = PyBytes_Size(*res);
5769 Py_ssize_t requiredsize;
5770 repsize = PyBytes_Size(repunicode);
5771 requiredsize = *respos + repsize;
5772 if (requiredsize > outsize)
5773 /* Make room for all additional bytes. */
5774 if (charmapencode_resize(res, respos, requiredsize)) {
5775 Py_DECREF(repunicode);
5776 return -1;
5777 }
5778 memcpy(PyBytes_AsString(*res) + *respos,
5779 PyBytes_AsString(repunicode), repsize);
5780 *respos += repsize;
5781 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005782 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005783 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005784 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005785 /* generate replacement */
5786 repsize = PyUnicode_GET_SIZE(repunicode);
5787 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 x = charmapencode_output(*uni2, mapping, res, respos);
5789 if (x==enc_EXCEPTION) {
5790 return -1;
5791 }
5792 else if (x==enc_FAILED) {
5793 Py_DECREF(repunicode);
5794 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5795 return -1;
5796 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005797 }
5798 *inpos = newpos;
5799 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 }
5801 return 0;
5802}
5803
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 Py_ssize_t size,
5806 PyObject *mapping,
5807 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 /* output object */
5810 PyObject *res = NULL;
5811 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005812 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005813 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815 PyObject *errorHandler = NULL;
5816 PyObject *exc = NULL;
5817 /* the following variable is used for caching string comparisons
5818 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5819 * 3=ignore, 4=xmlcharrefreplace */
5820 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
5822 /* Default to Latin-1 */
5823 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005826 /* allocate enough for a simple encoding without
5827 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005828 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 if (res == NULL)
5830 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005831 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 /* try to encode it */
5836 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5837 if (x==enc_EXCEPTION) /* error */
5838 goto onError;
5839 if (x==enc_FAILED) { /* unencodable character */
5840 if (charmap_encoding_error(p, size, &inpos, mapping,
5841 &exc,
5842 &known_errorHandler, &errorHandler, errors,
5843 &res, &respos)) {
5844 goto onError;
5845 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005846 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 else
5848 /* done with this character => adjust input position */
5849 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005853 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005854 if (_PyBytes_Resize(&res, respos) < 0)
5855 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 Py_XDECREF(exc);
5858 Py_XDECREF(errorHandler);
5859 return res;
5860
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 Py_XDECREF(res);
5863 Py_XDECREF(exc);
5864 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 return NULL;
5866}
5867
5868PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870{
5871 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 PyErr_BadArgument();
5873 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 }
5875 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 PyUnicode_GET_SIZE(unicode),
5877 mapping,
5878 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879}
5880
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881/* create or adjust a UnicodeTranslateError */
5882static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 const Py_UNICODE *unicode, Py_ssize_t size,
5884 Py_ssize_t startpos, Py_ssize_t endpos,
5885 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005888 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 }
5891 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5893 goto onError;
5894 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5895 goto onError;
5896 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5897 goto onError;
5898 return;
5899 onError:
5900 Py_DECREF(*exceptionObject);
5901 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 }
5903}
5904
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905/* raises a UnicodeTranslateError */
5906static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 const Py_UNICODE *unicode, Py_ssize_t size,
5908 Py_ssize_t startpos, Py_ssize_t endpos,
5909 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910{
5911 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915}
5916
5917/* error handling callback helper:
5918 build arguments, call the callback and check the arguments,
5919 put the result into newpos and return the replacement string, which
5920 has to be freed by the caller */
5921static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 PyObject **errorHandler,
5923 const char *reason,
5924 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5925 Py_ssize_t startpos, Py_ssize_t endpos,
5926 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005928 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005930 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 PyObject *restuple;
5932 PyObject *resunicode;
5933
5934 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 }
5939
5940 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005942 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005944
5945 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005949 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005950 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 Py_DECREF(restuple);
5952 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005953 }
5954 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 &resunicode, &i_newpos)) {
5956 Py_DECREF(restuple);
5957 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005959 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005961 else
5962 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005963 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5965 Py_DECREF(restuple);
5966 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005967 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005968 Py_INCREF(resunicode);
5969 Py_DECREF(restuple);
5970 return resunicode;
5971}
5972
5973/* Lookup the character ch in the mapping and put the result in result,
5974 which must be decrefed by the caller.
5975 Return 0 on success, -1 on error */
5976static
5977int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5978{
Christian Heimes217cfd12007-12-02 14:31:20 +00005979 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980 PyObject *x;
5981
5982 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005984 x = PyObject_GetItem(mapping, w);
5985 Py_DECREF(w);
5986 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5988 /* No mapping found means: use 1:1 mapping. */
5989 PyErr_Clear();
5990 *result = NULL;
5991 return 0;
5992 } else
5993 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 }
5995 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 *result = x;
5997 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005999 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 long value = PyLong_AS_LONG(x);
6001 long max = PyUnicode_GetMax();
6002 if (value < 0 || value > max) {
6003 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006004 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 Py_DECREF(x);
6006 return -1;
6007 }
6008 *result = x;
6009 return 0;
6010 }
6011 else if (PyUnicode_Check(x)) {
6012 *result = x;
6013 return 0;
6014 }
6015 else {
6016 /* wrong return value */
6017 PyErr_SetString(PyExc_TypeError,
6018 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006019 Py_DECREF(x);
6020 return -1;
6021 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022}
6023/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 if not reallocate and adjust various state variables.
6025 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026static
Walter Dörwald4894c302003-10-24 14:25:28 +00006027int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006030 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006031 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 /* remember old output position */
6033 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6034 /* exponentially overallocate to minimize reallocations */
6035 if (requiredsize < 2 * oldsize)
6036 requiredsize = 2 * oldsize;
6037 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6038 return -1;
6039 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006040 }
6041 return 0;
6042}
6043/* lookup the character, put the result in the output string and adjust
6044 various state variables. Return a new reference to the object that
6045 was put in the output buffer in *result, or Py_None, if the mapping was
6046 undefined (in which case no character was written).
6047 The called must decref result.
6048 Return 0 on success, -1 on error. */
6049static
Walter Dörwald4894c302003-10-24 14:25:28 +00006050int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6052 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053{
Walter Dörwald4894c302003-10-24 14:25:28 +00006054 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 /* not found => default to 1:1 mapping */
6058 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 }
6060 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006062 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 /* no overflow check, because we know that the space is enough */
6064 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 }
6066 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6068 if (repsize==1) {
6069 /* no overflow check, because we know that the space is enough */
6070 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6071 }
6072 else if (repsize!=0) {
6073 /* more than one character */
6074 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6075 (insize - (curinp-startinp)) +
6076 repsize - 1;
6077 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6078 return -1;
6079 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6080 *outp += repsize;
6081 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 }
6083 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006085 return 0;
6086}
6087
6088PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 Py_ssize_t size,
6090 PyObject *mapping,
6091 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 /* output object */
6094 PyObject *res = NULL;
6095 /* pointers to the beginning and end+1 of input */
6096 const Py_UNICODE *startp = p;
6097 const Py_UNICODE *endp = p + size;
6098 /* pointer into the output */
6099 Py_UNICODE *str;
6100 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006101 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102 char *reason = "character maps to <undefined>";
6103 PyObject *errorHandler = NULL;
6104 PyObject *exc = NULL;
6105 /* the following variable is used for caching string comparisons
6106 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6107 * 3=ignore, 4=xmlcharrefreplace */
6108 int known_errorHandler = -1;
6109
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 PyErr_BadArgument();
6112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114
6115 /* allocate enough for a simple 1:1 translation without
6116 replacements, if we need more, we'll resize */
6117 res = PyUnicode_FromUnicode(NULL, size);
6118 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006124 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 /* try to encode it */
6126 PyObject *x = NULL;
6127 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6128 Py_XDECREF(x);
6129 goto onError;
6130 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006131 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 if (x!=Py_None) /* it worked => adjust input pointer */
6133 ++p;
6134 else { /* untranslatable character */
6135 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6136 Py_ssize_t repsize;
6137 Py_ssize_t newpos;
6138 Py_UNICODE *uni2;
6139 /* startpos for collecting untranslatable chars */
6140 const Py_UNICODE *collstart = p;
6141 const Py_UNICODE *collend = p+1;
6142 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 /* find all untranslatable characters */
6145 while (collend < endp) {
6146 if (charmaptranslate_lookup(*collend, mapping, &x))
6147 goto onError;
6148 Py_XDECREF(x);
6149 if (x!=Py_None)
6150 break;
6151 ++collend;
6152 }
6153 /* cache callback name lookup
6154 * (if not done yet, i.e. it's the first error) */
6155 if (known_errorHandler==-1) {
6156 if ((errors==NULL) || (!strcmp(errors, "strict")))
6157 known_errorHandler = 1;
6158 else if (!strcmp(errors, "replace"))
6159 known_errorHandler = 2;
6160 else if (!strcmp(errors, "ignore"))
6161 known_errorHandler = 3;
6162 else if (!strcmp(errors, "xmlcharrefreplace"))
6163 known_errorHandler = 4;
6164 else
6165 known_errorHandler = 0;
6166 }
6167 switch (known_errorHandler) {
6168 case 1: /* strict */
6169 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006170 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 case 2: /* replace */
6172 /* No need to check for space, this is a 1:1 replacement */
6173 for (coll = collstart; coll<collend; ++coll)
6174 *str++ = '?';
6175 /* fall through */
6176 case 3: /* ignore */
6177 p = collend;
6178 break;
6179 case 4: /* xmlcharrefreplace */
6180 /* generate replacement (temporarily (mis)uses p) */
6181 for (p = collstart; p < collend; ++p) {
6182 char buffer[2+29+1+1];
6183 char *cp;
6184 sprintf(buffer, "&#%d;", (int)*p);
6185 if (charmaptranslate_makespace(&res, &str,
6186 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6187 goto onError;
6188 for (cp = buffer; *cp; ++cp)
6189 *str++ = *cp;
6190 }
6191 p = collend;
6192 break;
6193 default:
6194 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6195 reason, startp, size, &exc,
6196 collstart-startp, collend-startp, &newpos);
6197 if (repunicode == NULL)
6198 goto onError;
6199 /* generate replacement */
6200 repsize = PyUnicode_GET_SIZE(repunicode);
6201 if (charmaptranslate_makespace(&res, &str,
6202 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6203 Py_DECREF(repunicode);
6204 goto onError;
6205 }
6206 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6207 *str++ = *uni2;
6208 p = startp + newpos;
6209 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006210 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006211 }
6212 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006213 /* Resize if we allocated to much */
6214 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006215 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 if (PyUnicode_Resize(&res, respos) < 0)
6217 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006218 }
6219 Py_XDECREF(exc);
6220 Py_XDECREF(errorHandler);
6221 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006224 Py_XDECREF(res);
6225 Py_XDECREF(exc);
6226 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 return NULL;
6228}
6229
6230PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 PyObject *mapping,
6232 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233{
6234 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006235
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 str = PyUnicode_FromObject(str);
6237 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 PyUnicode_GET_SIZE(str),
6241 mapping,
6242 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 Py_DECREF(str);
6244 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006245
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 Py_XDECREF(str);
6248 return NULL;
6249}
Tim Petersced69f82003-09-16 20:30:58 +00006250
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006251PyObject *
6252PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6253 Py_ssize_t length)
6254{
6255 PyObject *result;
6256 Py_UNICODE *p; /* write pointer into result */
6257 Py_ssize_t i;
6258 /* Copy to a new string */
6259 result = (PyObject *)_PyUnicode_New(length);
6260 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6261 if (result == NULL)
6262 return result;
6263 p = PyUnicode_AS_UNICODE(result);
6264 /* Iterate over code points */
6265 for (i = 0; i < length; i++) {
6266 Py_UNICODE ch =s[i];
6267 if (ch > 127) {
6268 int decimal = Py_UNICODE_TODECIMAL(ch);
6269 if (decimal >= 0)
6270 p[i] = '0' + decimal;
6271 }
6272 }
6273 return result;
6274}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006275/* --- Decimal Encoder ---------------------------------------------------- */
6276
6277int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 Py_ssize_t length,
6279 char *output,
6280 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006281{
6282 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 PyObject *errorHandler = NULL;
6284 PyObject *exc = NULL;
6285 const char *encoding = "decimal";
6286 const char *reason = "invalid decimal Unicode string";
6287 /* the following variable is used for caching string comparisons
6288 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6289 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006290
6291 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 PyErr_BadArgument();
6293 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006294 }
6295
6296 p = s;
6297 end = s + length;
6298 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 register Py_UNICODE ch = *p;
6300 int decimal;
6301 PyObject *repunicode;
6302 Py_ssize_t repsize;
6303 Py_ssize_t newpos;
6304 Py_UNICODE *uni2;
6305 Py_UNICODE *collstart;
6306 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006307
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006309 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 ++p;
6311 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006312 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 decimal = Py_UNICODE_TODECIMAL(ch);
6314 if (decimal >= 0) {
6315 *output++ = '0' + decimal;
6316 ++p;
6317 continue;
6318 }
6319 if (0 < ch && ch < 256) {
6320 *output++ = (char)ch;
6321 ++p;
6322 continue;
6323 }
6324 /* All other characters are considered unencodable */
6325 collstart = p;
Victor Stinnerab1d16b2011-11-22 01:45:37 +01006326 for (collend = p+1; collend < end; collend++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 if ((0 < *collend && *collend < 256) ||
Victor Stinnerab1d16b2011-11-22 01:45:37 +01006328 Py_UNICODE_ISSPACE(*collend) ||
6329 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 break;
6331 }
6332 /* cache callback name lookup
6333 * (if not done yet, i.e. it's the first error) */
6334 if (known_errorHandler==-1) {
6335 if ((errors==NULL) || (!strcmp(errors, "strict")))
6336 known_errorHandler = 1;
6337 else if (!strcmp(errors, "replace"))
6338 known_errorHandler = 2;
6339 else if (!strcmp(errors, "ignore"))
6340 known_errorHandler = 3;
6341 else if (!strcmp(errors, "xmlcharrefreplace"))
6342 known_errorHandler = 4;
6343 else
6344 known_errorHandler = 0;
6345 }
6346 switch (known_errorHandler) {
6347 case 1: /* strict */
6348 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6349 goto onError;
6350 case 2: /* replace */
6351 for (p = collstart; p < collend; ++p)
6352 *output++ = '?';
6353 /* fall through */
6354 case 3: /* ignore */
6355 p = collend;
6356 break;
6357 case 4: /* xmlcharrefreplace */
6358 /* generate replacement (temporarily (mis)uses p) */
6359 for (p = collstart; p < collend; ++p)
6360 output += sprintf(output, "&#%d;", (int)*p);
6361 p = collend;
6362 break;
6363 default:
6364 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6365 encoding, reason, s, length, &exc,
6366 collstart-s, collend-s, &newpos);
6367 if (repunicode == NULL)
6368 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006369 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006370 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006371 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6372 Py_DECREF(repunicode);
6373 goto onError;
6374 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 /* generate replacement */
6376 repsize = PyUnicode_GET_SIZE(repunicode);
6377 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6378 Py_UNICODE ch = *uni2;
6379 if (Py_UNICODE_ISSPACE(ch))
6380 *output++ = ' ';
6381 else {
6382 decimal = Py_UNICODE_TODECIMAL(ch);
6383 if (decimal >= 0)
6384 *output++ = '0' + decimal;
6385 else if (0 < ch && ch < 256)
6386 *output++ = (char)ch;
6387 else {
6388 Py_DECREF(repunicode);
6389 raise_encode_exception(&exc, encoding,
6390 s, length, collstart-s, collend-s, reason);
6391 goto onError;
6392 }
6393 }
6394 }
6395 p = s + newpos;
6396 Py_DECREF(repunicode);
6397 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006398 }
6399 /* 0-terminate the output string */
6400 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 Py_XDECREF(exc);
6402 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006403 return 0;
6404
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 Py_XDECREF(exc);
6407 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006408 return -1;
6409}
6410
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411/* --- Helpers ------------------------------------------------------------ */
6412
Eric Smith8c663262007-08-25 02:26:07 +00006413#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006414#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006415
Thomas Wouters477c8d52006-05-27 19:21:47 +00006416#include "stringlib/count.h"
6417#include "stringlib/find.h"
6418#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006419#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006420
Eric Smith5807c412008-05-11 21:00:57 +00006421#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006422#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006423#include "stringlib/localeutil.h"
6424
/* Helper macro to fix up start/end slice values in place:
   - end is capped at len; a negative end has len added to it and is
     then floored at 0;
   - a negative start has len added to it and is then floored at 0
     (a start beyond len is left as it is).
   start and end must be modifiable lvalues; every argument is evaluated
   more than once.
   NOTE(review): the expansion is a bare sequence of if-statements, so
   the macro may only be used as a complete statement of its own.  Its
   replacement list is kept unchanged here; before restructuring it,
   check whether the stringlib headers included above define a macro of
   the same name. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006439
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * to by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to the next char, so it might have
 * side-effects (especially if used with other macros).
 */
6452
/* helper macros used by _Py_UNICODE_NEXT; 'ch' is evaluated twice */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

/* Return the character at 'ptr' and advance 'ptr' past it.
 * On narrow builds a high surrogate immediately followed by a low
 * surrogate is joined into one code point and 'ptr' advances by two.
 * The pair is only joined when both units lie before 'end': the length
 * test comes first so that (ptr)[1] is never read when 'ptr' points at
 * the last unit of the buffer, and a lone high surrogate there is
 * returned unchanged.
 */
#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
#define _Py_UNICODE_NEXT(ptr, end)                                      \
    ((((end) - (ptr)) > 1 &&                                            \
      _Py_UNICODE_IS_HIGH_SURROGATE((ptr)[0]) &&                        \
      _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                         \
     ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :  \
     (Py_UCS4)*(ptr)++)
#endif
6470
Martin v. Löwis18e16552006-02-15 17:27:45 +00006471Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006472 PyObject *substr,
6473 Py_ssize_t start,
6474 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006476 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 PyUnicodeObject* str_obj;
6478 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006479
Thomas Wouters477c8d52006-05-27 19:21:47 +00006480 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6481 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006483 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6484 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 Py_DECREF(str_obj);
6486 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 }
Tim Petersced69f82003-09-16 20:30:58 +00006488
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006489 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006490 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006491 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6492 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006493 );
6494
6495 Py_DECREF(sub_obj);
6496 Py_DECREF(str_obj);
6497
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498 return result;
6499}
6500
Martin v. Löwis18e16552006-02-15 17:27:45 +00006501Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006502 PyObject *sub,
6503 Py_ssize_t start,
6504 Py_ssize_t end,
6505 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006507 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006508
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006510 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006512 sub = PyUnicode_FromObject(sub);
6513 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 Py_DECREF(str);
6515 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 }
Tim Petersced69f82003-09-16 20:30:58 +00006517
Thomas Wouters477c8d52006-05-27 19:21:47 +00006518 if (direction > 0)
6519 result = stringlib_find_slice(
6520 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6521 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6522 start, end
6523 );
6524 else
6525 result = stringlib_rfind_slice(
6526 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6527 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6528 start, end
6529 );
6530
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006532 Py_DECREF(sub);
6533
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 return result;
6535}
6536
Tim Petersced69f82003-09-16 20:30:58 +00006537static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 PyUnicodeObject *substring,
6540 Py_ssize_t start,
6541 Py_ssize_t end,
6542 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 if (substring->length == 0)
6545 return 1;
6546
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006547 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 end -= substring->length;
6549 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551
6552 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 if (Py_UNICODE_MATCH(self, end, substring))
6554 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 } else {
6556 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 }
6559
6560 return 0;
6561}
6562
Martin v. Löwis18e16552006-02-15 17:27:45 +00006563Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 PyObject *substr,
6565 Py_ssize_t start,
6566 Py_ssize_t end,
6567 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006569 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006570
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 str = PyUnicode_FromObject(str);
6572 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 substr = PyUnicode_FromObject(substr);
6575 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 Py_DECREF(str);
6577 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 }
Tim Petersced69f82003-09-16 20:30:58 +00006579
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 (PyUnicodeObject *)substr,
6582 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 Py_DECREF(str);
6584 Py_DECREF(substr);
6585 return result;
6586}
6587
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588/* Apply fixfct filter to the Unicode object self and return a
6589 reference to the modified object */
6590
Tim Petersced69f82003-09-16 20:30:58 +00006591static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594{
6595
6596 PyUnicodeObject *u;
6597
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006598 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006601
6602 Py_UNICODE_COPY(u->str, self->str, self->length);
6603
Tim Peters7a29bd52001-09-12 03:03:31 +00006604 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 /* fixfct should return TRUE if it modified the buffer. If
6606 FALSE, return a reference to the original buffer instead
6607 (to save space, not time) */
6608 Py_INCREF(self);
6609 Py_DECREF(u);
6610 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 }
6612 return (PyObject*) u;
6613}
6614
Tim Petersced69f82003-09-16 20:30:58 +00006615static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616int fixupper(PyUnicodeObject *self)
6617{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006618 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 Py_UNICODE *s = self->str;
6620 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006621
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006624
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 ch = Py_UNICODE_TOUPPER(*s);
6626 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 *s = ch;
6629 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630 s++;
6631 }
6632
6633 return status;
6634}
6635
Tim Petersced69f82003-09-16 20:30:58 +00006636static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637int fixlower(PyUnicodeObject *self)
6638{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006639 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 Py_UNICODE *s = self->str;
6641 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006642
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006644 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006645
Benjamin Peterson29060642009-01-31 22:14:21 +00006646 ch = Py_UNICODE_TOLOWER(*s);
6647 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 *s = ch;
6650 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 s++;
6652 }
6653
6654 return status;
6655}
6656
Tim Petersced69f82003-09-16 20:30:58 +00006657static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658int fixswapcase(PyUnicodeObject *self)
6659{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006660 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 Py_UNICODE *s = self->str;
6662 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006663
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 while (len-- > 0) {
6665 if (Py_UNICODE_ISUPPER(*s)) {
6666 *s = Py_UNICODE_TOLOWER(*s);
6667 status = 1;
6668 } else if (Py_UNICODE_ISLOWER(*s)) {
6669 *s = Py_UNICODE_TOUPPER(*s);
6670 status = 1;
6671 }
6672 s++;
6673 }
6674
6675 return status;
6676}
6677
Tim Petersced69f82003-09-16 20:30:58 +00006678static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679int fixcapitalize(PyUnicodeObject *self)
6680{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006681 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006682 Py_UNICODE *s = self->str;
6683 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006684
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006685 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 return 0;
Ezio Melottiee8d9982011-08-15 09:09:57 +03006687 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006688 *s = Py_UNICODE_TOUPPER(*s);
6689 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006691 s++;
6692 while (--len > 0) {
Ezio Melottiee8d9982011-08-15 09:09:57 +03006693 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006694 *s = Py_UNICODE_TOLOWER(*s);
6695 status = 1;
6696 }
6697 s++;
6698 }
6699 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700}
6701
6702static
6703int fixtitle(PyUnicodeObject *self)
6704{
6705 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6706 register Py_UNICODE *e;
6707 int previous_is_cased;
6708
6709 /* Shortcut for single character strings */
6710 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6712 if (*p != ch) {
6713 *p = ch;
6714 return 1;
6715 }
6716 else
6717 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 }
Tim Petersced69f82003-09-16 20:30:58 +00006719
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 e = p + PyUnicode_GET_SIZE(self);
6721 previous_is_cased = 0;
6722 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006724
Benjamin Peterson29060642009-01-31 22:14:21 +00006725 if (previous_is_cased)
6726 *p = Py_UNICODE_TOLOWER(ch);
6727 else
6728 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006729
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 if (Py_UNICODE_ISLOWER(ch) ||
6731 Py_UNICODE_ISUPPER(ch) ||
6732 Py_UNICODE_ISTITLE(ch))
6733 previous_is_cased = 1;
6734 else
6735 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 }
6737 return 1;
6738}
6739
Tim Peters8ce9f162004-08-27 01:49:32 +00006740PyObject *
6741PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742{
Skip Montanaro6543b452004-09-16 03:28:13 +00006743 const Py_UNICODE blank = ' ';
6744 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006745 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006746 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006747 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6748 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006749 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6750 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006751 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006752 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
Tim Peters05eba1f2004-08-27 21:32:02 +00006754 fseq = PySequence_Fast(seq, "");
6755 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006756 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006757 }
6758
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006759 /* NOTE: the following code can't call back into Python code,
6760 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006761 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006762
Tim Peters05eba1f2004-08-27 21:32:02 +00006763 seqlen = PySequence_Fast_GET_SIZE(fseq);
6764 /* If empty sequence, return u"". */
6765 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006766 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6767 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006768 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006769 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006770 /* If singleton sequence with an exact Unicode, return that. */
6771 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 item = items[0];
6773 if (PyUnicode_CheckExact(item)) {
6774 Py_INCREF(item);
6775 res = (PyUnicodeObject *)item;
6776 goto Done;
6777 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006778 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006779 else {
6780 /* Set up sep and seplen */
6781 if (separator == NULL) {
6782 sep = &blank;
6783 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006784 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006785 else {
6786 if (!PyUnicode_Check(separator)) {
6787 PyErr_Format(PyExc_TypeError,
6788 "separator: expected str instance,"
6789 " %.80s found",
6790 Py_TYPE(separator)->tp_name);
6791 goto onError;
6792 }
6793 sep = PyUnicode_AS_UNICODE(separator);
6794 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006795 }
6796 }
6797
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006798 /* There are at least two things to join, or else we have a subclass
6799 * of str in the sequence.
6800 * Do a pre-pass to figure out the total amount of space we'll
6801 * need (sz), and see whether all argument are strings.
6802 */
6803 sz = 0;
6804 for (i = 0; i < seqlen; i++) {
6805 const Py_ssize_t old_sz = sz;
6806 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 if (!PyUnicode_Check(item)) {
6808 PyErr_Format(PyExc_TypeError,
6809 "sequence item %zd: expected str instance,"
6810 " %.80s found",
6811 i, Py_TYPE(item)->tp_name);
6812 goto onError;
6813 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006814 sz += PyUnicode_GET_SIZE(item);
6815 if (i != 0)
6816 sz += seplen;
6817 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6818 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006820 goto onError;
6821 }
6822 }
Tim Petersced69f82003-09-16 20:30:58 +00006823
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006824 res = _PyUnicode_New(sz);
6825 if (res == NULL)
6826 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006827
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006828 /* Catenate everything. */
6829 res_p = PyUnicode_AS_UNICODE(res);
6830 for (i = 0; i < seqlen; ++i) {
6831 Py_ssize_t itemlen;
6832 item = items[i];
6833 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 /* Copy item, and maybe the separator. */
6835 if (i) {
6836 Py_UNICODE_COPY(res_p, sep, seplen);
6837 res_p += seplen;
6838 }
6839 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6840 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006841 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006842
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006844 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 return (PyObject *)res;
6846
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006848 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006849 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 return NULL;
6851}
6852
Tim Petersced69f82003-09-16 20:30:58 +00006853static
6854PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 Py_ssize_t left,
6856 Py_ssize_t right,
6857 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858{
6859 PyUnicodeObject *u;
6860
6861 if (left < 0)
6862 left = 0;
6863 if (right < 0)
6864 right = 0;
6865
Tim Peters7a29bd52001-09-12 03:03:31 +00006866 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 Py_INCREF(self);
6868 return self;
6869 }
6870
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006871 if (left > PY_SSIZE_T_MAX - self->length ||
6872 right > PY_SSIZE_T_MAX - (left + self->length)) {
6873 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6874 return NULL;
6875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 u = _PyUnicode_New(left + self->length + right);
6877 if (u) {
6878 if (left)
6879 Py_UNICODE_FILL(u->str, fill, left);
6880 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6881 if (right)
6882 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6883 }
6884
6885 return u;
6886}
6887
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006888PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
6892 string = PyUnicode_FromObject(string);
6893 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006895
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006896 list = stringlib_splitlines(
6897 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6898 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899
6900 Py_DECREF(string);
6901 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006902}
6903
Tim Petersced69f82003-09-16 20:30:58 +00006904static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006906 PyUnicodeObject *substring,
6907 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006910 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006913 return stringlib_split_whitespace(
6914 (PyObject*) self, self->str, self->length, maxcount
6915 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006917 return stringlib_split(
6918 (PyObject*) self, self->str, self->length,
6919 substring->str, substring->length,
6920 maxcount
6921 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006922}
6923
Tim Petersced69f82003-09-16 20:30:58 +00006924static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006925PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 PyUnicodeObject *substring,
6927 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006928{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006929 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006930 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006931
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006932 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006933 return stringlib_rsplit_whitespace(
6934 (PyObject*) self, self->str, self->length, maxcount
6935 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006936
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006937 return stringlib_rsplit(
6938 (PyObject*) self, self->str, self->length,
6939 substring->str, substring->length,
6940 maxcount
6941 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006942}
6943
6944static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006946 PyUnicodeObject *str1,
6947 PyUnicodeObject *str2,
6948 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949{
6950 PyUnicodeObject *u;
6951
6952 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006954 else if (maxcount == 0 || self->length == 0)
6955 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956
Thomas Wouters477c8d52006-05-27 19:21:47 +00006957 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006958 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006959 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006960 if (str1->length == 0)
6961 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006962 if (str1->length == 1) {
6963 /* replace characters */
6964 Py_UNICODE u1, u2;
6965 if (!findchar(self->str, self->length, str1->str[0]))
6966 goto nothing;
6967 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6968 if (!u)
6969 return NULL;
6970 Py_UNICODE_COPY(u->str, self->str, self->length);
6971 u1 = str1->str[0];
6972 u2 = str2->str[0];
6973 for (i = 0; i < u->length; i++)
6974 if (u->str[i] == u1) {
6975 if (--maxcount < 0)
6976 break;
6977 u->str[i] = u2;
6978 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006980 i = stringlib_find(
6981 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006983 if (i < 0)
6984 goto nothing;
6985 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6986 if (!u)
6987 return NULL;
6988 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006989
6990 /* change everything in-place, starting with this one */
6991 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6992 i += str1->length;
6993
6994 while ( --maxcount > 0) {
6995 i = stringlib_find(self->str+i, self->length-i,
6996 str1->str, str1->length,
6997 i);
6998 if (i == -1)
6999 break;
7000 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7001 i += str1->length;
7002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007005
Victor Stinnerab1d16b2011-11-22 01:45:37 +01007006 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007007 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 Py_UNICODE *p;
7009
7010 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007011 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7012 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007013 if (n == 0)
7014 goto nothing;
7015 /* new_size = self->length + n * (str2->length - str1->length)); */
7016 delta = (str2->length - str1->length);
7017 if (delta == 0) {
7018 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007020 product = n * (str2->length - str1->length);
7021 if ((product / (str2->length - str1->length)) != n) {
7022 PyErr_SetString(PyExc_OverflowError,
7023 "replace string is too long");
7024 return NULL;
7025 }
7026 new_size = self->length + product;
7027 if (new_size < 0) {
7028 PyErr_SetString(PyExc_OverflowError,
7029 "replace string is too long");
7030 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 }
7032 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007033 u = _PyUnicode_New(new_size);
7034 if (!u)
7035 return NULL;
7036 i = 0;
7037 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007038 if (str1->length > 0) {
7039 while (n-- > 0) {
7040 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007041 j = stringlib_find(self->str+i, self->length-i,
7042 str1->str, str1->length,
7043 i);
7044 if (j == -1)
7045 break;
7046 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007047 /* copy unchanged part [i:j] */
7048 Py_UNICODE_COPY(p, self->str+i, j-i);
7049 p += j - i;
7050 }
7051 /* copy substitution string */
7052 if (str2->length > 0) {
7053 Py_UNICODE_COPY(p, str2->str, str2->length);
7054 p += str2->length;
7055 }
7056 i = j + str1->length;
7057 }
7058 if (i < self->length)
7059 /* copy tail [i:] */
7060 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7061 } else {
7062 /* interleave */
7063 while (n > 0) {
7064 Py_UNICODE_COPY(p, str2->str, str2->length);
7065 p += str2->length;
7066 if (--n <= 0)
7067 break;
7068 *p++ = self->str[i++];
7069 }
7070 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7071 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007074
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007076 /* nothing to replace; return original string (when possible) */
7077 if (PyUnicode_CheckExact(self)) {
7078 Py_INCREF(self);
7079 return (PyObject *) self;
7080 }
7081 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082}
7083
7084/* --- Unicode Object Methods --------------------------------------------- */
7085
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007086PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007087 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088\n\
7089Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007090characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007091
7092static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007093unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 return fixup(self, fixtitle);
7096}
7097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007098PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007099 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100\n\
7101Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007102have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103
7104static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007105unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 return fixup(self, fixcapitalize);
7108}
7109
#if 0
/* Disabled: this whole definition is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words (whitespace separated, no limit) */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx),
            fixcapitalize);

        if (capitalized == NULL)
            goto done;          /* result is still NULL */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words with the default (blank) separator */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
7147
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007148/* Argument converter. Coerces to a single unicode character */
7149
7150static int
7151convert_uc(PyObject *obj, void *addr)
7152{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007153 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7154 PyObject *uniobj;
7155 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007156
Benjamin Peterson14339b62009-01-31 16:36:08 +00007157 uniobj = PyUnicode_FromObject(obj);
7158 if (uniobj == NULL) {
7159 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007160 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007161 return 0;
7162 }
7163 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7164 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007165 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007166 Py_DECREF(uniobj);
7167 return 0;
7168 }
7169 unistr = PyUnicode_AS_UNICODE(uniobj);
7170 *fillcharloc = unistr[0];
7171 Py_DECREF(uniobj);
7172 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007173}
7174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007175PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007177\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007178Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007179done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180
7181static PyObject *
7182unicode_center(PyUnicodeObject *self, PyObject *args)
7183{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007184 Py_ssize_t marg, left;
7185 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007186 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007187
Thomas Woutersde017742006-02-16 19:34:37 +00007188 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 return NULL;
7190
Tim Peters7a29bd52001-09-12 03:03:31 +00007191 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192 Py_INCREF(self);
7193 return (PyObject*) self;
7194 }
7195
7196 marg = width - self->length;
7197 left = marg / 2 + (marg & width & 1);
7198
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007199 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007200}
7201
Marc-André Lemburge5034372000-08-08 08:04:29 +00007202#if 0
7203
7204/* This code should go into some future Unicode collation support
7205 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007206 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007207
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007208/* speedy UTF-16 code point order comparison */
7209/* gleaned from: */
7210/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7211
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007212static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007213{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007214 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007215 0, 0, 0, 0, 0, 0, 0, 0,
7216 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007217 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007218};
7219
Guido van Rossumd57fd912000-03-10 22:53:23 +00007220static int
7221unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7222{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007223 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007224
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225 Py_UNICODE *s1 = str1->str;
7226 Py_UNICODE *s2 = str2->str;
7227
7228 len1 = str1->length;
7229 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007230
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007232 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007233
7234 c1 = *s1++;
7235 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007236
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 if (c1 > (1<<11) * 26)
7238 c1 += utf16Fixup[c1>>11];
7239 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007240 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007241 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007242
7243 if (c1 != c2)
7244 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007245
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007246 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007247 }
7248
7249 return (len1 < len2) ? -1 : (len1 != len2);
7250}
7251
Marc-André Lemburge5034372000-08-08 08:04:29 +00007252#else
7253
7254static int
7255unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7256{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007257 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007258
7259 Py_UNICODE *s1 = str1->str;
7260 Py_UNICODE *s2 = str2->str;
7261
7262 len1 = str1->length;
7263 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007264
Marc-André Lemburge5034372000-08-08 08:04:29 +00007265 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007266 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007267
Fredrik Lundh45714e92001-06-26 16:39:36 +00007268 c1 = *s1++;
7269 c2 = *s2++;
7270
7271 if (c1 != c2)
7272 return (c1 < c2) ? -1 : 1;
7273
Marc-André Lemburge5034372000-08-08 08:04:29 +00007274 len1--; len2--;
7275 }
7276
7277 return (len1 < len2) ? -1 : (len1 != len2);
7278}
7279
7280#endif
7281
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007283 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007285 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7286 return unicode_compare((PyUnicodeObject *)left,
7287 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007288 PyErr_Format(PyExc_TypeError,
7289 "Can't compare %.100s and %.100s",
7290 left->ob_type->tp_name,
7291 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007292 return -1;
7293}
7294
Martin v. Löwis5b222132007-06-10 09:51:05 +00007295int
7296PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7297{
7298 int i;
7299 Py_UNICODE *id;
7300 assert(PyUnicode_Check(uni));
7301 id = PyUnicode_AS_UNICODE(uni);
7302 /* Compare Unicode string and source character set string */
7303 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007304 if (id[i] != str[i])
7305 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007306 /* This check keeps Python strings that end in '\0' from comparing equal
7307 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007308 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007310 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007312 return 0;
7313}
7314
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007315
Benjamin Peterson29060642009-01-31 22:14:21 +00007316#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007317 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007318
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007319PyObject *PyUnicode_RichCompare(PyObject *left,
7320 PyObject *right,
7321 int op)
7322{
7323 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007324
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007325 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7326 PyObject *v;
7327 if (((PyUnicodeObject *) left)->length !=
7328 ((PyUnicodeObject *) right)->length) {
7329 if (op == Py_EQ) {
7330 Py_INCREF(Py_False);
7331 return Py_False;
7332 }
7333 if (op == Py_NE) {
7334 Py_INCREF(Py_True);
7335 return Py_True;
7336 }
7337 }
7338 if (left == right)
7339 result = 0;
7340 else
7341 result = unicode_compare((PyUnicodeObject *)left,
7342 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007343
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007344 /* Convert the return value to a Boolean */
7345 switch (op) {
7346 case Py_EQ:
7347 v = TEST_COND(result == 0);
7348 break;
7349 case Py_NE:
7350 v = TEST_COND(result != 0);
7351 break;
7352 case Py_LE:
7353 v = TEST_COND(result <= 0);
7354 break;
7355 case Py_GE:
7356 v = TEST_COND(result >= 0);
7357 break;
7358 case Py_LT:
7359 v = TEST_COND(result == -1);
7360 break;
7361 case Py_GT:
7362 v = TEST_COND(result == 1);
7363 break;
7364 default:
7365 PyErr_BadArgument();
7366 return NULL;
7367 }
7368 Py_INCREF(v);
7369 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007370 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007371
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007372 Py_INCREF(Py_NotImplemented);
7373 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007374}
7375
Guido van Rossum403d68b2000-03-13 15:55:09 +00007376int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007377 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007378{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007379 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007380 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007381
7382 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007383 sub = PyUnicode_FromObject(element);
7384 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 PyErr_Format(PyExc_TypeError,
7386 "'in <string>' requires string as left operand, not %s",
7387 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007388 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007389 }
7390
Thomas Wouters477c8d52006-05-27 19:21:47 +00007391 str = PyUnicode_FromObject(container);
7392 if (!str) {
7393 Py_DECREF(sub);
7394 return -1;
7395 }
7396
7397 result = stringlib_contains_obj(str, sub);
7398
7399 Py_DECREF(str);
7400 Py_DECREF(sub);
7401
Guido van Rossum403d68b2000-03-13 15:55:09 +00007402 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007403}
7404
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405/* Concat to string or Unicode object giving a new Unicode object. */
7406
7407PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409{
7410 PyUnicodeObject *u = NULL, *v = NULL, *w;
7411
7412 /* Coerce the two arguments */
7413 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7414 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7417 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419
7420 /* Shortcuts */
7421 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 Py_DECREF(v);
7423 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 }
7425 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007426 Py_DECREF(u);
7427 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428 }
7429
7430 /* Concat the two Unicode strings */
7431 w = _PyUnicode_New(u->length + v->length);
7432 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434 Py_UNICODE_COPY(w->str, u->str, u->length);
7435 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7436
7437 Py_DECREF(u);
7438 Py_DECREF(v);
7439 return (PyObject *)w;
7440
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442 Py_XDECREF(u);
7443 Py_XDECREF(v);
7444 return NULL;
7445}
7446
Walter Dörwald1ab83302007-05-18 17:15:44 +00007447void
7448PyUnicode_Append(PyObject **pleft, PyObject *right)
7449{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007450 PyObject *new;
7451 if (*pleft == NULL)
7452 return;
7453 if (right == NULL || !PyUnicode_Check(*pleft)) {
7454 Py_DECREF(*pleft);
7455 *pleft = NULL;
7456 return;
7457 }
7458 new = PyUnicode_Concat(*pleft, right);
7459 Py_DECREF(*pleft);
7460 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007461}
7462
7463void
7464PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7465{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007466 PyUnicode_Append(pleft, right);
7467 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007468}
7469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007470PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007473Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007474string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007475interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476
7477static PyObject *
7478unicode_count(PyUnicodeObject *self, PyObject *args)
7479{
7480 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007481 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007482 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483 PyObject *result;
7484
Jesus Ceaac451502011-04-20 17:09:23 +02007485 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7486 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007488
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007489 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007490 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007491 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007492 substring->str, substring->length,
7493 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007494 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495
7496 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007497
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 return result;
7499}
7500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007501PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007502 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007504Encode S using the codec registered for encoding. Default encoding\n\
7505is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007506handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007507a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7508'xmlcharrefreplace' as well as any other name registered with\n\
7509codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510
7511static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007512unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007514 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515 char *encoding = NULL;
7516 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007517
Benjamin Peterson308d6372009-09-18 21:42:35 +00007518 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7519 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007521 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007522}
7523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007524PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526\n\
7527Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007528If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529
7530static PyObject*
7531unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7532{
7533 Py_UNICODE *e;
7534 Py_UNICODE *p;
7535 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007536 Py_UNICODE *qe;
7537 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007538 PyUnicodeObject *u;
7539 int tabsize = 8;
7540
7541 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543
Thomas Wouters7e474022000-07-16 12:04:32 +00007544 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007545 i = 0; /* chars up to and including most recent \n or \r */
7546 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7547 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 for (p = self->str; p < e; p++)
7549 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 if (tabsize > 0) {
7551 incr = tabsize - (j % tabsize); /* cannot overflow */
7552 if (j > PY_SSIZE_T_MAX - incr)
7553 goto overflow1;
7554 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007555 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 if (j > PY_SSIZE_T_MAX - 1)
7559 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560 j++;
7561 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007562 if (i > PY_SSIZE_T_MAX - j)
7563 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007565 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 }
7567 }
7568
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007569 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007571
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 /* Second pass: create output string and fill it */
7573 u = _PyUnicode_New(i + j);
7574 if (!u)
7575 return NULL;
7576
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007577 j = 0; /* same as in first pass */
7578 q = u->str; /* next output char */
7579 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580
7581 for (p = self->str; p < e; p++)
7582 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 if (tabsize > 0) {
7584 i = tabsize - (j % tabsize);
7585 j += i;
7586 while (i--) {
7587 if (q >= qe)
7588 goto overflow2;
7589 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007592 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 else {
7594 if (q >= qe)
7595 goto overflow2;
7596 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007597 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 if (*p == '\n' || *p == '\r')
7599 j = 0;
7600 }
7601
7602 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007603
7604 overflow2:
7605 Py_DECREF(u);
7606 overflow1:
7607 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609}
7610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007611PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007612 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613\n\
7614Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08007615such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616arguments start and end are interpreted as in slice notation.\n\
7617\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007618Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
7620static PyObject *
7621unicode_find(PyUnicodeObject *self, PyObject *args)
7622{
Jesus Ceaac451502011-04-20 17:09:23 +02007623 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007624 Py_ssize_t start;
7625 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007626 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
Jesus Ceaac451502011-04-20 17:09:23 +02007628 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7629 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
Thomas Wouters477c8d52006-05-27 19:21:47 +00007632 result = stringlib_find_slice(
7633 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7634 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7635 start, end
7636 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637
7638 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007639
Christian Heimes217cfd12007-12-02 14:31:20 +00007640 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641}
7642
7643static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007644unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645{
7646 if (index < 0 || index >= self->length) {
7647 PyErr_SetString(PyExc_IndexError, "string index out of range");
7648 return NULL;
7649 }
7650
7651 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7652}
7653
Guido van Rossumc2504932007-09-18 19:42:40 +00007654/* Believe it or not, this produces the same value for ASCII strings
7655 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007656static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007657unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658{
Guido van Rossumc2504932007-09-18 19:42:40 +00007659 Py_ssize_t len;
7660 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007661 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007662
7663 if (self->hash != -1)
7664 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007665 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007666 p = self->str;
7667 x = *p << 7;
7668 while (--len >= 0)
7669 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007670 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007671 if (x == -1)
7672 x = -2;
7673 self->hash = x;
7674 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675}
7676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007677PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007680Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681
7682static PyObject *
7683unicode_index(PyUnicodeObject *self, PyObject *args)
7684{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007685 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007686 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007687 Py_ssize_t start;
7688 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689
Jesus Ceaac451502011-04-20 17:09:23 +02007690 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7691 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693
Thomas Wouters477c8d52006-05-27 19:21:47 +00007694 result = stringlib_find_slice(
7695 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7696 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7697 start, end
7698 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007699
7700 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007701
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702 if (result < 0) {
7703 PyErr_SetString(PyExc_ValueError, "substring not found");
7704 return NULL;
7705 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007706
Christian Heimes217cfd12007-12-02 14:31:20 +00007707 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708}
7709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007710PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007713Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007714at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715
7716static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007717unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718{
7719 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7720 register const Py_UNICODE *e;
7721 int cased;
7722
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 /* Shortcut for single character strings */
7724 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007727 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007728 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007729 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007730
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 e = p + PyUnicode_GET_SIZE(self);
7732 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007733 while (p < e) {
7734 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007735
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7737 return PyBool_FromLong(0);
7738 else if (!cased && Py_UNICODE_ISLOWER(ch))
7739 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007741 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742}
7743
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007744PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007747Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007748at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749
7750static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007751unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752{
7753 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7754 register const Py_UNICODE *e;
7755 int cased;
7756
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757 /* Shortcut for single character strings */
7758 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007761 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007762 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007764
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 e = p + PyUnicode_GET_SIZE(self);
7766 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007767 while (p < e) {
7768 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007769
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7771 return PyBool_FromLong(0);
7772 else if (!cased && Py_UNICODE_ISUPPER(ch))
7773 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007775 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776}
7777
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007778PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007781Return True if S is a titlecased string and there is at least one\n\
7782character in S, i.e. upper- and titlecase characters may only\n\
7783follow uncased characters and lowercase characters only cased ones.\n\
7784Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785
7786static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007787unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788{
7789 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7790 register const Py_UNICODE *e;
7791 int cased, previous_is_cased;
7792
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 /* Shortcut for single character strings */
7794 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7796 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007798 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007799 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007801
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 e = p + PyUnicode_GET_SIZE(self);
7803 cased = 0;
7804 previous_is_cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007805 while (p < e) {
7806 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007807
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7809 if (previous_is_cased)
7810 return PyBool_FromLong(0);
7811 previous_is_cased = 1;
7812 cased = 1;
7813 }
7814 else if (Py_UNICODE_ISLOWER(ch)) {
7815 if (!previous_is_cased)
7816 return PyBool_FromLong(0);
7817 previous_is_cased = 1;
7818 cased = 1;
7819 }
7820 else
7821 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007823 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824}
7825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007826PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007829Return True if all characters in S are whitespace\n\
7830and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007831
7832static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007833unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834{
7835 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7836 register const Py_UNICODE *e;
7837
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 /* Shortcut for single character strings */
7839 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 Py_UNICODE_ISSPACE(*p))
7841 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007843 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007844 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007846
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007848 while (p < e) {
7849 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7850 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007853 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854}
7855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007856PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007858\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007859Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007860and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007861
7862static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007863unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007864{
7865 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7866 register const Py_UNICODE *e;
7867
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007868 /* Shortcut for single character strings */
7869 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 Py_UNICODE_ISALPHA(*p))
7871 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007872
7873 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007874 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007876
7877 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007878 while (p < e) {
7879 if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007881 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007882 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007883}
7884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007885PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007887\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007888Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007889and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007890
7891static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007892unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007893{
7894 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7895 register const Py_UNICODE *e;
7896
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007897 /* Shortcut for single character strings */
7898 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 Py_UNICODE_ISALNUM(*p))
7900 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007901
7902 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007903 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007905
7906 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007907 while (p < e) {
7908 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7909 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007911 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007912 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007913}
7914
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007915PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007918Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007919False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920
7921static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007922unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923{
7924 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7925 register const Py_UNICODE *e;
7926
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927 /* Shortcut for single character strings */
7928 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 Py_UNICODE_ISDECIMAL(*p))
7930 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007932 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007933 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007935
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007937 while (p < e) {
7938 if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007941 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942}
7943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007944PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007947Return True if all characters in S are digits\n\
7948and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949
7950static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007951unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952{
7953 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7954 register const Py_UNICODE *e;
7955
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 /* Shortcut for single character strings */
7957 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 Py_UNICODE_ISDIGIT(*p))
7959 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007960
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007961 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007962 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007964
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007966 while (p < e) {
7967 if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007970 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971}
7972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007973PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007976Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007977False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978
7979static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007980unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981{
7982 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7983 register const Py_UNICODE *e;
7984
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 /* Shortcut for single character strings */
7986 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 Py_UNICODE_ISNUMERIC(*p))
7988 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007990 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007991 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007993
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007995 while (p < e) {
7996 if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007999 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000}
8001
Martin v. Löwis47383402007-08-15 07:32:56 +00008002int
8003PyUnicode_IsIdentifier(PyObject *self)
8004{
Benjamin Petersonf413b802011-08-12 22:17:18 -05008005 const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008006 const Py_UNICODE *e;
8007 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +00008008
8009 /* Special case for empty strings */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008010 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008012
8013 /* PEP 3131 says that the first character must be in
8014 XID_Start and subsequent characters in XID_Continue,
8015 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008017 letters, digits, underscore). However, given the current
8018 definition of XID_Start and XID_Continue, it is sufficient
8019 to check just for these, except that _ must be allowed
8020 as starting an identifier. */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008021 e = p + PyUnicode_GET_SIZE(self);
8022 first = _Py_UNICODE_NEXT(p, e);
Benjamin Petersonf413b802011-08-12 22:17:18 -05008023 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +00008024 return 0;
8025
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008026 while (p < e)
8027 if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008029 return 1;
8030}
8031
8032PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008034\n\
8035Return True if S is a valid identifier according\n\
8036to the language definition.");
8037
8038static PyObject*
8039unicode_isidentifier(PyObject *self)
8040{
8041 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8042}
8043
Georg Brandl559e5d72008-06-11 18:37:52 +00008044PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008046\n\
8047Return True if all characters in S are considered\n\
8048printable in repr() or S is empty, False otherwise.");
8049
8050static PyObject*
8051unicode_isprintable(PyObject *self)
8052{
8053 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8054 register const Py_UNICODE *e;
8055
8056 /* Shortcut for single character strings */
8057 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8058 Py_RETURN_TRUE;
8059 }
8060
8061 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008062 while (p < e) {
8063 if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
Georg Brandl559e5d72008-06-11 18:37:52 +00008064 Py_RETURN_FALSE;
8065 }
8066 }
8067 Py_RETURN_TRUE;
8068}
8069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008070PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008071 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072\n\
8073Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008074iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075
8076static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008077unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008079 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080}
8081
Martin v. Löwis18e16552006-02-15 17:27:45 +00008082static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083unicode_length(PyUnicodeObject *self)
8084{
8085 return self->length;
8086}
8087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008088PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008091Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008092done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008093
8094static PyObject *
8095unicode_ljust(PyUnicodeObject *self, PyObject *args)
8096{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008097 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008098 Py_UNICODE fillchar = ' ';
8099
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008100 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101 return NULL;
8102
Tim Peters7a29bd52001-09-12 03:03:31 +00008103 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104 Py_INCREF(self);
8105 return (PyObject*) self;
8106 }
8107
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008108 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109}
8110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008111PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008114Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115
8116static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008117unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119 return fixup(self, fixlower);
8120}
8121
/* Selectors for the strip helpers; each value is also an index into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string without its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (&stripformat[i][3])
8130
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008131/* externally visible for str.strip(unicode) */
8132PyObject *
8133_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8134{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008135 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8136 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8137 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8138 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8139 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008140
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008142
Benjamin Peterson14339b62009-01-31 16:36:08 +00008143 i = 0;
8144 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8146 i++;
8147 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008148 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008149
Benjamin Peterson14339b62009-01-31 16:36:08 +00008150 j = len;
8151 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 do {
8153 j--;
8154 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8155 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008156 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008157
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 Py_INCREF(self);
8160 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008161 }
8162 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008164}
8165
Guido van Rossumd57fd912000-03-10 22:53:23 +00008166
8167static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008168do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008170 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8171 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008172
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 i = 0;
8174 if (striptype != RIGHTSTRIP) {
8175 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8176 i++;
8177 }
8178 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008179
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 j = len;
8181 if (striptype != LEFTSTRIP) {
8182 do {
8183 j--;
8184 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8185 j++;
8186 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008187
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8189 Py_INCREF(self);
8190 return (PyObject*)self;
8191 }
8192 else
8193 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194}
8195
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008196
8197static PyObject *
8198do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8199{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008200 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008201
Benjamin Peterson14339b62009-01-31 16:36:08 +00008202 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8203 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008204
Benjamin Peterson14339b62009-01-31 16:36:08 +00008205 if (sep != NULL && sep != Py_None) {
8206 if (PyUnicode_Check(sep))
8207 return _PyUnicode_XStrip(self, striptype, sep);
8208 else {
8209 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 "%s arg must be None or str",
8211 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212 return NULL;
8213 }
8214 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008215
Benjamin Peterson14339b62009-01-31 16:36:08 +00008216 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008217}
8218
8219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008220PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008222\n\
8223Return a copy of the string S with leading and trailing\n\
8224whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008225If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008226
8227static PyObject *
8228unicode_strip(PyUnicodeObject *self, PyObject *args)
8229{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008230 if (PyTuple_GET_SIZE(args) == 0)
8231 return do_strip(self, BOTHSTRIP); /* Common case */
8232 else
8233 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008234}
8235
8236
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008237PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008239\n\
8240Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008241If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008242
8243static PyObject *
8244unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8245{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008246 if (PyTuple_GET_SIZE(args) == 0)
8247 return do_strip(self, LEFTSTRIP); /* Common case */
8248 else
8249 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008250}
8251
8252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008253PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008255\n\
8256Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008257If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008258
8259static PyObject *
8260unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8261{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008262 if (PyTuple_GET_SIZE(args) == 0)
8263 return do_strip(self, RIGHTSTRIP); /* Common case */
8264 else
8265 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008266}
8267
8268
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008270unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271{
8272 PyUnicodeObject *u;
8273 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008274 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008275 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276
Georg Brandl222de0f2009-04-12 12:01:50 +00008277 if (len < 1) {
8278 Py_INCREF(unicode_empty);
8279 return (PyObject *)unicode_empty;
8280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281
Tim Peters7a29bd52001-09-12 03:03:31 +00008282 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283 /* no repeat, return original string */
8284 Py_INCREF(str);
8285 return (PyObject*) str;
8286 }
Tim Peters8f422462000-09-09 06:13:41 +00008287
8288 /* ensure # of chars needed doesn't overflow int and # of bytes
8289 * needed doesn't overflow size_t
8290 */
8291 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008292 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008293 PyErr_SetString(PyExc_OverflowError,
8294 "repeated string is too long");
8295 return NULL;
8296 }
8297 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8298 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8299 PyErr_SetString(PyExc_OverflowError,
8300 "repeated string is too long");
8301 return NULL;
8302 }
8303 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 if (!u)
8305 return NULL;
8306
8307 p = u->str;
8308
Georg Brandl222de0f2009-04-12 12:01:50 +00008309 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008310 Py_UNICODE_FILL(p, str->str[0], len);
8311 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008312 Py_ssize_t done = str->length; /* number of characters copied this far */
8313 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008315 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008316 Py_UNICODE_COPY(p+done, p, n);
8317 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 }
8320
8321 return (PyObject*) u;
8322}
8323
8324PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 PyObject *subobj,
8326 PyObject *replobj,
8327 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328{
8329 PyObject *self;
8330 PyObject *str1;
8331 PyObject *str2;
8332 PyObject *result;
8333
8334 self = PyUnicode_FromObject(obj);
8335 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008336 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 str1 = PyUnicode_FromObject(subobj);
8338 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 Py_DECREF(self);
8340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 }
8342 str2 = PyUnicode_FromObject(replobj);
8343 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 Py_DECREF(self);
8345 Py_DECREF(str1);
8346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 }
Tim Petersced69f82003-09-16 20:30:58 +00008348 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 (PyUnicodeObject *)str1,
8350 (PyUnicodeObject *)str2,
8351 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 Py_DECREF(self);
8353 Py_DECREF(str1);
8354 Py_DECREF(str2);
8355 return result;
8356}
8357
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008358PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008359 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360\n\
8361Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008362old replaced by new. If the optional argument count is\n\
8363given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364
8365static PyObject*
8366unicode_replace(PyUnicodeObject *self, PyObject *args)
8367{
8368 PyUnicodeObject *str1;
8369 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008370 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371 PyObject *result;
8372
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374 return NULL;
8375 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8376 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008379 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 Py_DECREF(str1);
8381 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383
8384 result = replace(self, str1, str2, maxcount);
8385
8386 Py_DECREF(str1);
8387 Py_DECREF(str2);
8388 return result;
8389}
8390
8391static
8392PyObject *unicode_repr(PyObject *unicode)
8393{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008394 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008395 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008396 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8397 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8398
8399 /* XXX(nnorwitz): rather than over-allocating, it would be
8400 better to choose a different scheme. Perhaps scan the
8401 first N-chars of the string and allocate based on that size.
8402 */
8403 /* Initial allocation is based on the longest-possible unichr
8404 escape.
8405
8406 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8407 unichr, so in this case it's the longest unichr escape. In
8408 narrow (UTF-16) builds this is five chars per source unichr
8409 since there are two unichrs in the surrogate pair, so in narrow
8410 (UTF-16) builds it's not the longest unichr escape.
8411
8412 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8413 so in the narrow (UTF-16) build case it's the longest unichr
8414 escape.
8415 */
8416
Walter Dörwald1ab83302007-05-18 17:15:44 +00008417 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008419#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008421#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008423#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008425 if (repr == NULL)
8426 return NULL;
8427
Walter Dörwald1ab83302007-05-18 17:15:44 +00008428 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008429
8430 /* Add quote */
8431 *p++ = (findchar(s, size, '\'') &&
8432 !findchar(s, size, '"')) ? '"' : '\'';
8433 while (size-- > 0) {
8434 Py_UNICODE ch = *s++;
8435
8436 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008437 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008438 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008439 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008440 continue;
8441 }
8442
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008444 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008445 *p++ = '\\';
8446 *p++ = 't';
8447 }
8448 else if (ch == '\n') {
8449 *p++ = '\\';
8450 *p++ = 'n';
8451 }
8452 else if (ch == '\r') {
8453 *p++ = '\\';
8454 *p++ = 'r';
8455 }
8456
8457 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008458 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008459 *p++ = '\\';
8460 *p++ = 'x';
8461 *p++ = hexdigits[(ch >> 4) & 0x000F];
8462 *p++ = hexdigits[ch & 0x000F];
8463 }
8464
Georg Brandl559e5d72008-06-11 18:37:52 +00008465 /* Copy ASCII characters as-is */
8466 else if (ch < 0x7F) {
8467 *p++ = ch;
8468 }
8469
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008471 else {
8472 Py_UCS4 ucs = ch;
8473
8474#ifndef Py_UNICODE_WIDE
8475 Py_UNICODE ch2 = 0;
8476 /* Get code point from surrogate pair */
8477 if (size > 0) {
8478 ch2 = *s;
8479 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008481 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008483 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008484 size--;
8485 }
8486 }
8487#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008488 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008489 (categories Z* and C* except ASCII space)
8490 */
8491 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8492 /* Map 8-bit characters to '\xhh' */
8493 if (ucs <= 0xff) {
8494 *p++ = '\\';
8495 *p++ = 'x';
8496 *p++ = hexdigits[(ch >> 4) & 0x000F];
8497 *p++ = hexdigits[ch & 0x000F];
8498 }
8499 /* Map 21-bit characters to '\U00xxxxxx' */
8500 else if (ucs >= 0x10000) {
8501 *p++ = '\\';
8502 *p++ = 'U';
8503 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8504 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8505 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8506 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8507 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8508 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8509 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8510 *p++ = hexdigits[ucs & 0x0000000F];
8511 }
8512 /* Map 16-bit characters to '\uxxxx' */
8513 else {
8514 *p++ = '\\';
8515 *p++ = 'u';
8516 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8517 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8518 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8519 *p++ = hexdigits[ucs & 0x000F];
8520 }
8521 }
8522 /* Copy characters as-is */
8523 else {
8524 *p++ = ch;
8525#ifndef Py_UNICODE_WIDE
8526 if (ucs >= 0x10000)
8527 *p++ = ch2;
8528#endif
8529 }
8530 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008531 }
8532 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008533 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008534
8535 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008536 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008537 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538}
8539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008540PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542\n\
8543Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08008544such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545arguments start and end are interpreted as in slice notation.\n\
8546\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008547Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548
8549static PyObject *
8550unicode_rfind(PyUnicodeObject *self, PyObject *args)
8551{
Jesus Ceaac451502011-04-20 17:09:23 +02008552 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008553 Py_ssize_t start;
8554 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008555 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556
Jesus Ceaac451502011-04-20 17:09:23 +02008557 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8558 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560
Thomas Wouters477c8d52006-05-27 19:21:47 +00008561 result = stringlib_rfind_slice(
8562 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8563 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8564 start, end
8565 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566
8567 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008568
Christian Heimes217cfd12007-12-02 14:31:20 +00008569 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008570}
8571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008572PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008573 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008575Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576
8577static PyObject *
8578unicode_rindex(PyUnicodeObject *self, PyObject *args)
8579{
Jesus Ceaac451502011-04-20 17:09:23 +02008580 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008581 Py_ssize_t start;
8582 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008583 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584
Jesus Ceaac451502011-04-20 17:09:23 +02008585 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8586 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
Thomas Wouters477c8d52006-05-27 19:21:47 +00008589 result = stringlib_rfind_slice(
8590 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8591 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8592 start, end
8593 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594
8595 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008596
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597 if (result < 0) {
8598 PyErr_SetString(PyExc_ValueError, "substring not found");
8599 return NULL;
8600 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008601 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602}
8603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008604PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008607Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008608done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
8610static PyObject *
8611unicode_rjust(PyUnicodeObject *self, PyObject *args)
8612{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008613 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008614 Py_UNICODE fillchar = ' ';
8615
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008616 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617 return NULL;
8618
Tim Peters7a29bd52001-09-12 03:03:31 +00008619 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620 Py_INCREF(self);
8621 return (PyObject*) self;
8622 }
8623
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008624 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625}
8626
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 PyObject *sep,
8629 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630{
8631 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008632
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 s = PyUnicode_FromObject(s);
8634 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008635 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 if (sep != NULL) {
8637 sep = PyUnicode_FromObject(sep);
8638 if (sep == NULL) {
8639 Py_DECREF(s);
8640 return NULL;
8641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 }
8643
8644 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8645
8646 Py_DECREF(s);
8647 Py_XDECREF(sep);
8648 return result;
8649}
8650
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008651PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653\n\
8654Return a list of the words in S, using sep as the\n\
8655delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008656splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008657whitespace string is a separator and empty strings are\n\
8658removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659
8660static PyObject*
8661unicode_split(PyUnicodeObject *self, PyObject *args)
8662{
8663 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008664 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665
Martin v. Löwis18e16552006-02-15 17:27:45 +00008666 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667 return NULL;
8668
8669 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675}
8676
Thomas Wouters477c8d52006-05-27 19:21:47 +00008677PyObject *
8678PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8679{
8680 PyObject* str_obj;
8681 PyObject* sep_obj;
8682 PyObject* out;
8683
8684 str_obj = PyUnicode_FromObject(str_in);
8685 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008687 sep_obj = PyUnicode_FromObject(sep_in);
8688 if (!sep_obj) {
8689 Py_DECREF(str_obj);
8690 return NULL;
8691 }
8692
8693 out = stringlib_partition(
8694 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8695 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8696 );
8697
8698 Py_DECREF(sep_obj);
8699 Py_DECREF(str_obj);
8700
8701 return out;
8702}
8703
8704
8705PyObject *
8706PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8707{
8708 PyObject* str_obj;
8709 PyObject* sep_obj;
8710 PyObject* out;
8711
8712 str_obj = PyUnicode_FromObject(str_in);
8713 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008715 sep_obj = PyUnicode_FromObject(sep_in);
8716 if (!sep_obj) {
8717 Py_DECREF(str_obj);
8718 return NULL;
8719 }
8720
8721 out = stringlib_rpartition(
8722 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8723 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8724 );
8725
8726 Py_DECREF(sep_obj);
8727 Py_DECREF(str_obj);
8728
8729 return out;
8730}
8731
8732PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008734\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008735Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008736the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008737found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008738
8739static PyObject*
8740unicode_partition(PyUnicodeObject *self, PyObject *separator)
8741{
8742 return PyUnicode_Partition((PyObject *)self, separator);
8743}
8744
8745PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008746 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008747\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008748Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008749the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008750separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008751
8752static PyObject*
8753unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8754{
8755 return PyUnicode_RPartition((PyObject *)self, separator);
8756}
8757
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008758PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 PyObject *sep,
8760 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008761{
8762 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008763
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008764 s = PyUnicode_FromObject(s);
8765 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008766 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 if (sep != NULL) {
8768 sep = PyUnicode_FromObject(sep);
8769 if (sep == NULL) {
8770 Py_DECREF(s);
8771 return NULL;
8772 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008773 }
8774
8775 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8776
8777 Py_DECREF(s);
8778 Py_XDECREF(sep);
8779 return result;
8780}
8781
8782PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008784\n\
8785Return a list of the words in S, using sep as the\n\
8786delimiter string, starting at the end of the string and\n\
8787working to the front. If maxsplit is given, at most maxsplit\n\
8788splits are done. If sep is not specified, any whitespace string\n\
8789is a separator.");
8790
8791static PyObject*
8792unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8793{
8794 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008795 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008796
Martin v. Löwis18e16552006-02-15 17:27:45 +00008797 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008798 return NULL;
8799
8800 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008802 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008804 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008806}
8807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008808PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810\n\
8811Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008812Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008813is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008814
8815static PyObject*
8816unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8817{
Guido van Rossum86662912000-04-11 15:38:46 +00008818 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008819
Guido van Rossum86662912000-04-11 15:38:46 +00008820 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821 return NULL;
8822
Guido van Rossum86662912000-04-11 15:38:46 +00008823 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824}
8825
8826static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008827PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828{
Walter Dörwald346737f2007-05-31 10:44:43 +00008829 if (PyUnicode_CheckExact(self)) {
8830 Py_INCREF(self);
8831 return self;
8832 } else
8833 /* Subtype -- return genuine unicode string with the same value. */
8834 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8835 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836}
8837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008838PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840\n\
8841Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008842and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843
8844static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008845unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 return fixup(self, fixswapcase);
8848}
8849
Georg Brandlceee0772007-11-27 23:48:05 +00008850PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008852\n\
8853Return a translation table usable for str.translate().\n\
8854If there is only one argument, it must be a dictionary mapping Unicode\n\
8855ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008856Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008857If there are two arguments, they must be strings of equal length, and\n\
8858in the resulting dictionary, each character in x will be mapped to the\n\
8859character at the same position in y. If there is a third argument, it\n\
8860must be a string, whose characters will be mapped to None in the result.");
8861
8862static PyObject*
8863unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8864{
8865 PyObject *x, *y = NULL, *z = NULL;
8866 PyObject *new = NULL, *key, *value;
8867 Py_ssize_t i = 0;
8868 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008869
Georg Brandlceee0772007-11-27 23:48:05 +00008870 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8871 return NULL;
8872 new = PyDict_New();
8873 if (!new)
8874 return NULL;
8875 if (y != NULL) {
8876 /* x must be a string too, of equal length */
8877 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8878 if (!PyUnicode_Check(x)) {
8879 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8880 "be a string if there is a second argument");
8881 goto err;
8882 }
8883 if (PyUnicode_GET_SIZE(x) != ylen) {
8884 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8885 "arguments must have equal length");
8886 goto err;
8887 }
8888 /* create entries for translating chars in x to those in y */
8889 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008890 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
Benjamin Peterson53aa1d72011-12-20 13:29:45 -06008891 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +00008892 goto err;
Benjamin Peterson53aa1d72011-12-20 13:29:45 -06008893 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8894 if (!value) {
8895 Py_DECREF(key);
8896 goto err;
8897 }
Georg Brandlceee0772007-11-27 23:48:05 +00008898 res = PyDict_SetItem(new, key, value);
8899 Py_DECREF(key);
8900 Py_DECREF(value);
8901 if (res < 0)
8902 goto err;
8903 }
8904 /* create entries for deleting chars in z */
8905 if (z != NULL) {
8906 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008907 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008908 if (!key)
8909 goto err;
8910 res = PyDict_SetItem(new, key, Py_None);
8911 Py_DECREF(key);
8912 if (res < 0)
8913 goto err;
8914 }
8915 }
8916 } else {
8917 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008918 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008919 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8920 "to maketrans it must be a dict");
8921 goto err;
8922 }
8923 /* copy entries into the new dict, converting string keys to int keys */
8924 while (PyDict_Next(x, &i, &key, &value)) {
8925 if (PyUnicode_Check(key)) {
8926 /* convert string keys to integer keys */
8927 PyObject *newkey;
8928 if (PyUnicode_GET_SIZE(key) != 1) {
8929 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8930 "table must be of length 1");
8931 goto err;
8932 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008933 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008934 if (!newkey)
8935 goto err;
8936 res = PyDict_SetItem(new, newkey, value);
8937 Py_DECREF(newkey);
8938 if (res < 0)
8939 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008940 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008941 /* just keep integer keys */
8942 if (PyDict_SetItem(new, key, value) < 0)
8943 goto err;
8944 } else {
8945 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8946 "be strings or integers");
8947 goto err;
8948 }
8949 }
8950 }
8951 return new;
8952 err:
8953 Py_DECREF(new);
8954 return NULL;
8955}
8956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008957PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008958 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959\n\
8960Return a copy of the string S, where all characters have been mapped\n\
8961through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008962Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008963Unmapped characters are left untouched. Characters mapped to None\n\
8964are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965
8966static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008967unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968{
Georg Brandlceee0772007-11-27 23:48:05 +00008969 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970}
8971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008972PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008975Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976
8977static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008978unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 return fixup(self, fixupper);
8981}
8982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008983PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008986Pad a numeric string S with zeros on the left, to fill a field\n\
8987of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988
8989static PyObject *
8990unicode_zfill(PyUnicodeObject *self, PyObject *args)
8991{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008992 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 PyUnicodeObject *u;
8994
Martin v. Löwis18e16552006-02-15 17:27:45 +00008995 Py_ssize_t width;
8996 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997 return NULL;
8998
8999 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009000 if (PyUnicode_CheckExact(self)) {
9001 Py_INCREF(self);
9002 return (PyObject*) self;
9003 }
9004 else
9005 return PyUnicode_FromUnicode(
9006 PyUnicode_AS_UNICODE(self),
9007 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 }
9010
9011 fill = width - self->length;
9012
9013 u = pad(self, fill, 0, '0');
9014
Walter Dörwald068325e2002-04-15 13:36:47 +00009015 if (u == NULL)
9016 return NULL;
9017
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 if (u->str[fill] == '+' || u->str[fill] == '-') {
9019 /* move sign to beginning of string */
9020 u->str[0] = u->str[fill];
9021 u->str[fill] = '0';
9022 }
9023
9024 return (PyObject*) u;
9025}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026
#if 0
/* Compiled out.  Returns the value of numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}

/* Compiled out.  Passes the string's buffer and size to
   PyUnicode_TransformDecimalToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalToASCII(
        PyUnicode_AS_UNICODE(self),
        PyUnicode_GET_SIZE(self));
}
#endif
9041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009042PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009043 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009045Return True if S starts with the specified prefix, False otherwise.\n\
9046With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009047With optional end, stop comparing S at that position.\n\
9048prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049
9050static PyObject *
9051unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009052 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009054 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009056 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009057 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009058 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059
Jesus Ceaac451502011-04-20 17:09:23 +02009060 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009062 if (PyTuple_Check(subobj)) {
9063 Py_ssize_t i;
9064 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9065 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009066 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009067 if (substring == NULL)
9068 return NULL;
9069 result = tailmatch(self, substring, start, end, -1);
9070 Py_DECREF(substring);
9071 if (result) {
9072 Py_RETURN_TRUE;
9073 }
9074 }
9075 /* nothing matched */
9076 Py_RETURN_FALSE;
9077 }
9078 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009079 if (substring == NULL) {
9080 if (PyErr_ExceptionMatches(PyExc_TypeError))
9081 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9082 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009083 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009084 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009085 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009087 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088}
9089
9090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009091PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009094Return True if S ends with the specified suffix, False otherwise.\n\
9095With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009096With optional end, stop comparing S at that position.\n\
9097suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098
9099static PyObject *
9100unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009101 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009103 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009105 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009106 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009107 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108
Jesus Ceaac451502011-04-20 17:09:23 +02009109 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009111 if (PyTuple_Check(subobj)) {
9112 Py_ssize_t i;
9113 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9114 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009116 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009118 result = tailmatch(self, substring, start, end, +1);
9119 Py_DECREF(substring);
9120 if (result) {
9121 Py_RETURN_TRUE;
9122 }
9123 }
9124 Py_RETURN_FALSE;
9125 }
9126 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009127 if (substring == NULL) {
9128 if (PyErr_ExceptionMatches(PyExc_TypeError))
9129 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9130 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009132 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009133 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009135 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136}
9137
Eric Smith8c663262007-08-25 02:26:07 +00009138#include "stringlib/string_format.h"
9139
9140PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009142\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009143Return a formatted version of S, using substitutions from args and kwargs.\n\
9144The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009145
Eric Smith27bbca62010-11-04 17:06:58 +00009146PyDoc_STRVAR(format_map__doc__,
9147 "S.format_map(mapping) -> str\n\
9148\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009149Return a formatted version of S, using substitutions from mapping.\n\
9150The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009151
Eric Smith4a7d76d2008-05-30 18:10:19 +00009152static PyObject *
9153unicode__format__(PyObject* self, PyObject* args)
9154{
9155 PyObject *format_spec;
9156
9157 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9158 return NULL;
9159
9160 return _PyUnicode_FormatAdvanced(self,
9161 PyUnicode_AS_UNICODE(format_spec),
9162 PyUnicode_GET_SIZE(format_spec));
9163}
9164
Eric Smith8c663262007-08-25 02:26:07 +00009165PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009167\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009168Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009169
9170static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009171unicode__sizeof__(PyUnicodeObject *v)
9172{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009173 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9174 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009175}
9176
9177PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009178 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009179
9180static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009181unicode_getnewargs(PyUnicodeObject *v)
9182{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009183 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009184}
9185
Guido van Rossumd57fd912000-03-10 22:53:23 +00009186static PyMethodDef unicode_methods[] = {
9187
9188 /* Order is according to common usage: often used methods should
9189 appear first, since lookup is done sequentially. */
9190
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009191 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009192 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9193 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009194 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009195 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9196 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9197 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9198 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9199 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9200 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9201 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009202 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009203 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9204 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9205 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009206 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009207 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9208 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9209 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009210 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009211 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009212 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009213 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009214 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9215 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9216 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9217 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9218 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9219 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9220 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9221 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9222 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9223 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9224 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9225 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9226 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9227 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009228 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009229 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009230 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009231 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009232 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009233 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009234 {"maketrans", (PyCFunction) unicode_maketrans,
9235 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009236 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009237#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009238 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239#endif
9240
9241#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009242 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009243 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009244 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009245#endif
9246
Benjamin Peterson14339b62009-01-31 16:36:08 +00009247 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009248 {NULL, NULL}
9249};
9250
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009251static PyObject *
9252unicode_mod(PyObject *v, PyObject *w)
9253{
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 if (!PyUnicode_Check(v)) {
9255 Py_INCREF(Py_NotImplemented);
9256 return Py_NotImplemented;
9257 }
9258 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009259}
9260
9261static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009262 0, /*nb_add*/
9263 0, /*nb_subtract*/
9264 0, /*nb_multiply*/
9265 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009266};
9267
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009269 (lenfunc) unicode_length, /* sq_length */
9270 PyUnicode_Concat, /* sq_concat */
9271 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9272 (ssizeargfunc) unicode_getitem, /* sq_item */
9273 0, /* sq_slice */
9274 0, /* sq_ass_item */
9275 0, /* sq_ass_slice */
9276 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009277};
9278
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009279static PyObject*
9280unicode_subscript(PyUnicodeObject* self, PyObject* item)
9281{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009282 if (PyIndex_Check(item)) {
9283 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009284 if (i == -1 && PyErr_Occurred())
9285 return NULL;
9286 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009287 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009288 return unicode_getitem(self, i);
9289 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009290 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009291 Py_UNICODE* source_buf;
9292 Py_UNICODE* result_buf;
9293 PyObject* result;
9294
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009295 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009296 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009297 return NULL;
9298 }
9299
9300 if (slicelength <= 0) {
9301 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009302 } else if (start == 0 && step == 1 && slicelength == self->length &&
9303 PyUnicode_CheckExact(self)) {
9304 Py_INCREF(self);
9305 return (PyObject *)self;
9306 } else if (step == 1) {
9307 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009308 } else {
9309 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009310 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9311 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009312
Benjamin Peterson29060642009-01-31 22:14:21 +00009313 if (result_buf == NULL)
9314 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009315
9316 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9317 result_buf[i] = source_buf[cur];
9318 }
Tim Petersced69f82003-09-16 20:30:58 +00009319
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009320 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009321 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009322 return result;
9323 }
9324 } else {
9325 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9326 return NULL;
9327 }
9328}
9329
/* Mapping protocol slots for str: length and subscripting (indices and
   slices, see unicode_subscript()).  Item assignment is not provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
9335
Guido van Rossumd57fd912000-03-10 22:53:23 +00009336
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337/* Helpers for PyUnicode_Format() */
9338
9339static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009340getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009342 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009344 (*p_argidx)++;
9345 if (arglen < 0)
9346 return args;
9347 else
9348 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 }
9350 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 return NULL;
9353}
9354
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009355/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009357static PyObject *
9358formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009360 char *p;
9361 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009363
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364 x = PyFloat_AsDouble(v);
9365 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009366 return NULL;
9367
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009370
Eric Smith0923d1d2009-04-16 20:16:10 +00009371 p = PyOS_double_to_string(x, type, prec,
9372 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009373 if (p == NULL)
9374 return NULL;
9375 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009376 PyMem_Free(p);
9377 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378}
9379
Tim Peters38fd5b62000-09-21 05:43:11 +00009380static PyObject*
9381formatlong(PyObject *val, int flags, int prec, int type)
9382{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009383 char *buf;
9384 int len;
9385 PyObject *str; /* temporary string object. */
9386 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009387
Benjamin Peterson14339b62009-01-31 16:36:08 +00009388 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9389 if (!str)
9390 return NULL;
9391 result = PyUnicode_FromStringAndSize(buf, len);
9392 Py_DECREF(str);
9393 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009394}
9395
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396static int
9397formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009398 size_t buflen,
9399 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009401 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009402 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009403 if (PyUnicode_GET_SIZE(v) == 1) {
9404 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9405 buf[1] = '\0';
9406 return 1;
9407 }
9408#ifndef Py_UNICODE_WIDE
9409 if (PyUnicode_GET_SIZE(v) == 2) {
9410 /* Decode a valid surrogate pair */
9411 int c0 = PyUnicode_AS_UNICODE(v)[0];
9412 int c1 = PyUnicode_AS_UNICODE(v)[1];
9413 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9414 0xDC00 <= c1 && c1 <= 0xDFFF) {
9415 buf[0] = c0;
9416 buf[1] = c1;
9417 buf[2] = '\0';
9418 return 2;
9419 }
9420 }
9421#endif
9422 goto onError;
9423 }
9424 else {
9425 /* Integer input truncated to a character */
9426 long x;
9427 x = PyLong_AsLong(v);
9428 if (x == -1 && PyErr_Occurred())
9429 goto onError;
9430
9431 if (x < 0 || x > 0x10ffff) {
9432 PyErr_SetString(PyExc_OverflowError,
9433 "%c arg not in range(0x110000)");
9434 return -1;
9435 }
9436
9437#ifndef Py_UNICODE_WIDE
9438 if (x > 0xffff) {
9439 x -= 0x10000;
9440 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9441 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9442 return 2;
9443 }
9444#endif
9445 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009446 buf[1] = '\0';
9447 return 1;
9448 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009449
Benjamin Peterson29060642009-01-31 22:14:21 +00009450 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009451 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009452 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009453 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454}
9455
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009456/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009457 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009458*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009459#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009460
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009462 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463{
9464 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009465 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466 int args_owned = 0;
9467 PyUnicodeObject *result = NULL;
9468 PyObject *dict = NULL;
9469 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009470
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 PyErr_BadInternalCall();
9473 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474 }
9475 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009476 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478 fmt = PyUnicode_AS_UNICODE(uformat);
9479 fmtcnt = PyUnicode_GET_SIZE(uformat);
9480
9481 reslen = rescnt = fmtcnt + 100;
9482 result = _PyUnicode_New(reslen);
9483 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485 res = PyUnicode_AS_UNICODE(result);
9486
9487 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009488 arglen = PyTuple_Size(args);
9489 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490 }
9491 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 arglen = -1;
9493 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009495 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009496 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498
9499 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009500 if (*fmt != '%') {
9501 if (--rescnt < 0) {
9502 rescnt = fmtcnt + 100;
9503 reslen += rescnt;
9504 if (_PyUnicode_Resize(&result, reslen) < 0)
9505 goto onError;
9506 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9507 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009508 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009510 }
9511 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 /* Got a format specifier */
9513 int flags = 0;
9514 Py_ssize_t width = -1;
9515 int prec = -1;
9516 Py_UNICODE c = '\0';
9517 Py_UNICODE fill;
9518 int isnumok;
9519 PyObject *v = NULL;
9520 PyObject *temp = NULL;
9521 Py_UNICODE *pbuf;
9522 Py_UNICODE sign;
9523 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009524 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525
Benjamin Peterson29060642009-01-31 22:14:21 +00009526 fmt++;
9527 if (*fmt == '(') {
9528 Py_UNICODE *keystart;
9529 Py_ssize_t keylen;
9530 PyObject *key;
9531 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009532
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 if (dict == NULL) {
9534 PyErr_SetString(PyExc_TypeError,
9535 "format requires a mapping");
9536 goto onError;
9537 }
9538 ++fmt;
9539 --fmtcnt;
9540 keystart = fmt;
9541 /* Skip over balanced parentheses */
9542 while (pcount > 0 && --fmtcnt >= 0) {
9543 if (*fmt == ')')
9544 --pcount;
9545 else if (*fmt == '(')
9546 ++pcount;
9547 fmt++;
9548 }
9549 keylen = fmt - keystart - 1;
9550 if (fmtcnt < 0 || pcount > 0) {
9551 PyErr_SetString(PyExc_ValueError,
9552 "incomplete format key");
9553 goto onError;
9554 }
9555#if 0
9556 /* keys are converted to strings using UTF-8 and
9557 then looked up since Python uses strings to hold
9558 variables names etc. in its namespaces and we
9559 wouldn't want to break common idioms. */
9560 key = PyUnicode_EncodeUTF8(keystart,
9561 keylen,
9562 NULL);
9563#else
9564 key = PyUnicode_FromUnicode(keystart, keylen);
9565#endif
9566 if (key == NULL)
9567 goto onError;
9568 if (args_owned) {
9569 Py_DECREF(args);
9570 args_owned = 0;
9571 }
9572 args = PyObject_GetItem(dict, key);
9573 Py_DECREF(key);
9574 if (args == NULL) {
9575 goto onError;
9576 }
9577 args_owned = 1;
9578 arglen = -1;
9579 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009580 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009581 while (--fmtcnt >= 0) {
9582 switch (c = *fmt++) {
9583 case '-': flags |= F_LJUST; continue;
9584 case '+': flags |= F_SIGN; continue;
9585 case ' ': flags |= F_BLANK; continue;
9586 case '#': flags |= F_ALT; continue;
9587 case '0': flags |= F_ZERO; continue;
9588 }
9589 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009590 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 if (c == '*') {
9592 v = getnextarg(args, arglen, &argidx);
9593 if (v == NULL)
9594 goto onError;
9595 if (!PyLong_Check(v)) {
9596 PyErr_SetString(PyExc_TypeError,
9597 "* wants int");
9598 goto onError;
9599 }
9600 width = PyLong_AsLong(v);
9601 if (width == -1 && PyErr_Occurred())
9602 goto onError;
9603 if (width < 0) {
9604 flags |= F_LJUST;
9605 width = -width;
9606 }
9607 if (--fmtcnt >= 0)
9608 c = *fmt++;
9609 }
9610 else if (c >= '0' && c <= '9') {
9611 width = c - '0';
9612 while (--fmtcnt >= 0) {
9613 c = *fmt++;
9614 if (c < '0' || c > '9')
9615 break;
9616 if ((width*10) / 10 != width) {
9617 PyErr_SetString(PyExc_ValueError,
9618 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009619 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009620 }
9621 width = width*10 + (c - '0');
9622 }
9623 }
9624 if (c == '.') {
9625 prec = 0;
9626 if (--fmtcnt >= 0)
9627 c = *fmt++;
9628 if (c == '*') {
9629 v = getnextarg(args, arglen, &argidx);
9630 if (v == NULL)
9631 goto onError;
9632 if (!PyLong_Check(v)) {
9633 PyErr_SetString(PyExc_TypeError,
9634 "* wants int");
9635 goto onError;
9636 }
9637 prec = PyLong_AsLong(v);
9638 if (prec == -1 && PyErr_Occurred())
9639 goto onError;
9640 if (prec < 0)
9641 prec = 0;
9642 if (--fmtcnt >= 0)
9643 c = *fmt++;
9644 }
9645 else if (c >= '0' && c <= '9') {
9646 prec = c - '0';
9647 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009648 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009649 if (c < '0' || c > '9')
9650 break;
9651 if ((prec*10) / 10 != prec) {
9652 PyErr_SetString(PyExc_ValueError,
9653 "prec too big");
9654 goto onError;
9655 }
9656 prec = prec*10 + (c - '0');
9657 }
9658 }
9659 } /* prec */
9660 if (fmtcnt >= 0) {
9661 if (c == 'h' || c == 'l' || c == 'L') {
9662 if (--fmtcnt >= 0)
9663 c = *fmt++;
9664 }
9665 }
9666 if (fmtcnt < 0) {
9667 PyErr_SetString(PyExc_ValueError,
9668 "incomplete format");
9669 goto onError;
9670 }
9671 if (c != '%') {
9672 v = getnextarg(args, arglen, &argidx);
9673 if (v == NULL)
9674 goto onError;
9675 }
9676 sign = 0;
9677 fill = ' ';
9678 switch (c) {
9679
9680 case '%':
9681 pbuf = formatbuf;
9682 /* presume that buffer length is at least 1 */
9683 pbuf[0] = '%';
9684 len = 1;
9685 break;
9686
9687 case 's':
9688 case 'r':
9689 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009690 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009691 temp = v;
9692 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009693 }
9694 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 if (c == 's')
9696 temp = PyObject_Str(v);
9697 else if (c == 'r')
9698 temp = PyObject_Repr(v);
9699 else
9700 temp = PyObject_ASCII(v);
9701 if (temp == NULL)
9702 goto onError;
9703 if (PyUnicode_Check(temp))
9704 /* nothing to do */;
9705 else {
9706 Py_DECREF(temp);
9707 PyErr_SetString(PyExc_TypeError,
9708 "%s argument has non-string str()");
9709 goto onError;
9710 }
9711 }
9712 pbuf = PyUnicode_AS_UNICODE(temp);
9713 len = PyUnicode_GET_SIZE(temp);
9714 if (prec >= 0 && len > prec)
9715 len = prec;
9716 break;
9717
9718 case 'i':
9719 case 'd':
9720 case 'u':
9721 case 'o':
9722 case 'x':
9723 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +00009724 isnumok = 0;
9725 if (PyNumber_Check(v)) {
9726 PyObject *iobj=NULL;
9727
9728 if (PyLong_Check(v)) {
9729 iobj = v;
9730 Py_INCREF(iobj);
9731 }
9732 else {
9733 iobj = PyNumber_Long(v);
9734 }
9735 if (iobj!=NULL) {
9736 if (PyLong_Check(iobj)) {
9737 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -07009738 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +00009739 Py_DECREF(iobj);
9740 if (!temp)
9741 goto onError;
9742 pbuf = PyUnicode_AS_UNICODE(temp);
9743 len = PyUnicode_GET_SIZE(temp);
9744 sign = 1;
9745 }
9746 else {
9747 Py_DECREF(iobj);
9748 }
9749 }
9750 }
9751 if (!isnumok) {
9752 PyErr_Format(PyExc_TypeError,
9753 "%%%c format: a number is required, "
9754 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9755 goto onError;
9756 }
9757 if (flags & F_ZERO)
9758 fill = '0';
9759 break;
9760
9761 case 'e':
9762 case 'E':
9763 case 'f':
9764 case 'F':
9765 case 'g':
9766 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009767 temp = formatfloat(v, flags, prec, c);
9768 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009769 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009770 pbuf = PyUnicode_AS_UNICODE(temp);
9771 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009772 sign = 1;
9773 if (flags & F_ZERO)
9774 fill = '0';
9775 break;
9776
9777 case 'c':
9778 pbuf = formatbuf;
9779 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9780 if (len < 0)
9781 goto onError;
9782 break;
9783
9784 default:
9785 PyErr_Format(PyExc_ValueError,
9786 "unsupported format character '%c' (0x%x) "
9787 "at index %zd",
9788 (31<=c && c<=126) ? (char)c : '?',
9789 (int)c,
9790 (Py_ssize_t)(fmt - 1 -
9791 PyUnicode_AS_UNICODE(uformat)));
9792 goto onError;
9793 }
9794 if (sign) {
9795 if (*pbuf == '-' || *pbuf == '+') {
9796 sign = *pbuf++;
9797 len--;
9798 }
9799 else if (flags & F_SIGN)
9800 sign = '+';
9801 else if (flags & F_BLANK)
9802 sign = ' ';
9803 else
9804 sign = 0;
9805 }
9806 if (width < len)
9807 width = len;
9808 if (rescnt - (sign != 0) < width) {
9809 reslen -= rescnt;
9810 rescnt = width + fmtcnt + 100;
9811 reslen += rescnt;
9812 if (reslen < 0) {
9813 Py_XDECREF(temp);
9814 PyErr_NoMemory();
9815 goto onError;
9816 }
9817 if (_PyUnicode_Resize(&result, reslen) < 0) {
9818 Py_XDECREF(temp);
9819 goto onError;
9820 }
9821 res = PyUnicode_AS_UNICODE(result)
9822 + reslen - rescnt;
9823 }
9824 if (sign) {
9825 if (fill != ' ')
9826 *res++ = sign;
9827 rescnt--;
9828 if (width > len)
9829 width--;
9830 }
9831 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9832 assert(pbuf[0] == '0');
9833 assert(pbuf[1] == c);
9834 if (fill != ' ') {
9835 *res++ = *pbuf++;
9836 *res++ = *pbuf++;
9837 }
9838 rescnt -= 2;
9839 width -= 2;
9840 if (width < 0)
9841 width = 0;
9842 len -= 2;
9843 }
9844 if (width > len && !(flags & F_LJUST)) {
9845 do {
9846 --rescnt;
9847 *res++ = fill;
9848 } while (--width > len);
9849 }
9850 if (fill == ' ') {
9851 if (sign)
9852 *res++ = sign;
9853 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9854 assert(pbuf[0] == '0');
9855 assert(pbuf[1] == c);
9856 *res++ = *pbuf++;
9857 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009858 }
9859 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009860 Py_UNICODE_COPY(res, pbuf, len);
9861 res += len;
9862 rescnt -= len;
9863 while (--width >= len) {
9864 --rescnt;
9865 *res++ = ' ';
9866 }
9867 if (dict && (argidx < arglen) && c != '%') {
9868 PyErr_SetString(PyExc_TypeError,
9869 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009870 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 goto onError;
9872 }
9873 Py_XDECREF(temp);
9874 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009875 } /* until end */
9876 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009877 PyErr_SetString(PyExc_TypeError,
9878 "not all arguments converted during string formatting");
9879 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880 }
9881
Thomas Woutersa96affe2006-03-12 00:29:36 +00009882 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009883 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009885 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886 }
9887 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888 return (PyObject *)result;
9889
Benjamin Peterson29060642009-01-31 22:14:21 +00009890 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891 Py_XDECREF(result);
9892 Py_DECREF(uformat);
9893 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009894 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895 }
9896 return NULL;
9897}
9898
Jeremy Hylton938ace62002-07-17 16:30:39 +00009899static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009900unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9901
Tim Peters6d6c1a32001-08-02 04:15:00 +00009902static PyObject *
9903unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9904{
Benjamin Peterson29060642009-01-31 22:14:21 +00009905 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009906 static char *kwlist[] = {"object", "encoding", "errors", 0};
9907 char *encoding = NULL;
9908 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009909
Benjamin Peterson14339b62009-01-31 16:36:08 +00009910 if (type != &PyUnicode_Type)
9911 return unicode_subtype_new(type, args, kwds);
9912 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009914 return NULL;
9915 if (x == NULL)
9916 return (PyObject *)_PyUnicode_New(0);
9917 if (encoding == NULL && errors == NULL)
9918 return PyObject_Str(x);
9919 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009920 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009921}
9922
Guido van Rossume023fe02001-08-30 03:12:59 +00009923static PyObject *
9924unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9925{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009926 PyUnicodeObject *tmp, *pnew;
9927 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009928
Benjamin Peterson14339b62009-01-31 16:36:08 +00009929 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9930 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9931 if (tmp == NULL)
9932 return NULL;
9933 assert(PyUnicode_Check(tmp));
9934 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9935 if (pnew == NULL) {
9936 Py_DECREF(tmp);
9937 return NULL;
9938 }
9939 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9940 if (pnew->str == NULL) {
9941 _Py_ForgetReference((PyObject *)pnew);
9942 PyObject_Del(pnew);
9943 Py_DECREF(tmp);
9944 return PyErr_NoMemory();
9945 }
9946 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9947 pnew->length = n;
9948 pnew->hash = tmp->hash;
9949 Py_DECREF(tmp);
9950 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009951}
9952
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009959
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009960static PyObject *unicode_iter(PyObject *seq);
9961
/* Type object for str.  Wires the number, sequence and mapping slot
   tables, the hash/repr/str/rich-compare/iter functions, the method table
   and the constructor into the type.  The type may be subclassed
   (Py_TPFLAGS_BASETYPE) and is flagged Py_TPFLAGS_UNICODE_SUBCLASS. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
10005
10006/* Initialize the Unicode implementation */
10007
Thomas Wouters78890102000-07-22 19:25:51 +000010008void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010010 int i;
10011
Thomas Wouters477c8d52006-05-27 19:21:47 +000010012 /* XXX - move this array to unicodectype.c ? */
10013 Py_UNICODE linebreak[] = {
10014 0x000A, /* LINE FEED */
10015 0x000D, /* CARRIAGE RETURN */
10016 0x001C, /* FILE SEPARATOR */
10017 0x001D, /* GROUP SEPARATOR */
10018 0x001E, /* RECORD SEPARATOR */
10019 0x0085, /* NEXT LINE */
10020 0x2028, /* LINE SEPARATOR */
10021 0x2029, /* PARAGRAPH SEPARATOR */
10022 };
10023
Fred Drakee4315f52000-05-09 19:53:39 +000010024 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010025 free_list = NULL;
10026 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010028 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010029 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010030
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010031 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010032 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010033 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010034 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010035
10036 /* initialize the linebreak bloom filter */
10037 bloom_linebreak = make_bloom_mask(
10038 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10039 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010040
10041 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042}
10043
10044/* Finalize the Unicode implementation */
10045
Christian Heimesa156e092008-02-16 07:38:31 +000010046int
10047PyUnicode_ClearFreeList(void)
10048{
10049 int freelist_size = numfree;
10050 PyUnicodeObject *u;
10051
10052 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010053 PyUnicodeObject *v = u;
10054 u = *(PyUnicodeObject **)u;
10055 if (v->str)
10056 PyObject_DEL(v->str);
10057 Py_XDECREF(v->defenc);
10058 PyObject_Del(v);
10059 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010060 }
10061 free_list = NULL;
10062 assert(numfree == 0);
10063 return freelist_size;
10064}
10065
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066void
Thomas Wouters78890102000-07-22 19:25:51 +000010067_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010069 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010071 Py_XDECREF(unicode_empty);
10072 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010073
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010074 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010075 if (unicode_latin1[i]) {
10076 Py_DECREF(unicode_latin1[i]);
10077 unicode_latin1[i] = NULL;
10078 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010079 }
Christian Heimesa156e092008-02-16 07:38:31 +000010080 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010082
Walter Dörwald16807132007-05-25 13:52:07 +000010083void
10084PyUnicode_InternInPlace(PyObject **p)
10085{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010086 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10087 PyObject *t;
10088 if (s == NULL || !PyUnicode_Check(s))
10089 Py_FatalError(
10090 "PyUnicode_InternInPlace: unicode strings only please!");
10091 /* If it's a subclass, we don't really know what putting
10092 it in the interned dict might do. */
10093 if (!PyUnicode_CheckExact(s))
10094 return;
10095 if (PyUnicode_CHECK_INTERNED(s))
10096 return;
10097 if (interned == NULL) {
10098 interned = PyDict_New();
10099 if (interned == NULL) {
10100 PyErr_Clear(); /* Don't leave an exception */
10101 return;
10102 }
10103 }
10104 /* It might be that the GetItem call fails even
10105 though the key is present in the dictionary,
10106 namely when this happens during a stack overflow. */
10107 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010108 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010109 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010110
Benjamin Peterson29060642009-01-31 22:14:21 +000010111 if (t) {
10112 Py_INCREF(t);
10113 Py_DECREF(*p);
10114 *p = t;
10115 return;
10116 }
Walter Dörwald16807132007-05-25 13:52:07 +000010117
Benjamin Peterson14339b62009-01-31 16:36:08 +000010118 PyThreadState_GET()->recursion_critical = 1;
10119 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10120 PyErr_Clear();
10121 PyThreadState_GET()->recursion_critical = 0;
10122 return;
10123 }
10124 PyThreadState_GET()->recursion_critical = 0;
10125 /* The two references in interned are not counted by refcnt.
10126 The deallocator will take care of this */
10127 Py_REFCNT(s) -= 2;
10128 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010129}
10130
10131void
10132PyUnicode_InternImmortal(PyObject **p)
10133{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010134 PyUnicode_InternInPlace(p);
10135 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10136 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10137 Py_INCREF(*p);
10138 }
Walter Dörwald16807132007-05-25 13:52:07 +000010139}
10140
10141PyObject *
10142PyUnicode_InternFromString(const char *cp)
10143{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010144 PyObject *s = PyUnicode_FromString(cp);
10145 if (s == NULL)
10146 return NULL;
10147 PyUnicode_InternInPlace(&s);
10148 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010149}
10150
10151void _Py_ReleaseInternedUnicodeStrings(void)
10152{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010153 PyObject *keys;
10154 PyUnicodeObject *s;
10155 Py_ssize_t i, n;
10156 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010157
Benjamin Peterson14339b62009-01-31 16:36:08 +000010158 if (interned == NULL || !PyDict_Check(interned))
10159 return;
10160 keys = PyDict_Keys(interned);
10161 if (keys == NULL || !PyList_Check(keys)) {
10162 PyErr_Clear();
10163 return;
10164 }
Walter Dörwald16807132007-05-25 13:52:07 +000010165
Benjamin Peterson14339b62009-01-31 16:36:08 +000010166 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10167 detector, interned unicode strings are not forcibly deallocated;
10168 rather, we give them their stolen references back, and then clear
10169 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010170
Benjamin Peterson14339b62009-01-31 16:36:08 +000010171 n = PyList_GET_SIZE(keys);
10172 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010173 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010174 for (i = 0; i < n; i++) {
10175 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10176 switch (s->state) {
10177 case SSTATE_NOT_INTERNED:
10178 /* XXX Shouldn't happen */
10179 break;
10180 case SSTATE_INTERNED_IMMORTAL:
10181 Py_REFCNT(s) += 1;
10182 immortal_size += s->length;
10183 break;
10184 case SSTATE_INTERNED_MORTAL:
10185 Py_REFCNT(s) += 2;
10186 mortal_size += s->length;
10187 break;
10188 default:
10189 Py_FatalError("Inconsistent interned string state.");
10190 }
10191 s->state = SSTATE_NOT_INTERNED;
10192 }
10193 fprintf(stderr, "total size of all interned strings: "
10194 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10195 "mortal/immortal\n", mortal_size, immortal_size);
10196 Py_DECREF(keys);
10197 PyDict_Clear(interned);
10198 Py_DECREF(interned);
10199 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010200}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010201
10202
10203/********************* Unicode Iterator **************************/
10204
10205typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010206 PyObject_HEAD
10207 Py_ssize_t it_index;
10208 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010209} unicodeiterobject;
10210
10211static void
10212unicodeiter_dealloc(unicodeiterobject *it)
10213{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010214 _PyObject_GC_UNTRACK(it);
10215 Py_XDECREF(it->it_seq);
10216 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010217}
10218
10219static int
10220unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10221{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010222 Py_VISIT(it->it_seq);
10223 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010224}
10225
10226static PyObject *
10227unicodeiter_next(unicodeiterobject *it)
10228{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010229 PyUnicodeObject *seq;
10230 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010231
Benjamin Peterson14339b62009-01-31 16:36:08 +000010232 assert(it != NULL);
10233 seq = it->it_seq;
10234 if (seq == NULL)
10235 return NULL;
10236 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010237
Benjamin Peterson14339b62009-01-31 16:36:08 +000010238 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10239 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010240 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010241 if (item != NULL)
10242 ++it->it_index;
10243 return item;
10244 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010245
Benjamin Peterson14339b62009-01-31 16:36:08 +000010246 Py_DECREF(seq);
10247 it->it_seq = NULL;
10248 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010249}
10250
10251static PyObject *
10252unicodeiter_len(unicodeiterobject *it)
10253{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010254 Py_ssize_t len = 0;
10255 if (it->it_seq)
10256 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10257 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010258}
10259
10260PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10261
10262static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010263 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010264 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010265 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010266};
10267
10268PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010269 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10270 "str_iterator", /* tp_name */
10271 sizeof(unicodeiterobject), /* tp_basicsize */
10272 0, /* tp_itemsize */
10273 /* methods */
10274 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10275 0, /* tp_print */
10276 0, /* tp_getattr */
10277 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010278 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010279 0, /* tp_repr */
10280 0, /* tp_as_number */
10281 0, /* tp_as_sequence */
10282 0, /* tp_as_mapping */
10283 0, /* tp_hash */
10284 0, /* tp_call */
10285 0, /* tp_str */
10286 PyObject_GenericGetAttr, /* tp_getattro */
10287 0, /* tp_setattro */
10288 0, /* tp_as_buffer */
10289 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10290 0, /* tp_doc */
10291 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10292 0, /* tp_clear */
10293 0, /* tp_richcompare */
10294 0, /* tp_weaklistoffset */
10295 PyObject_SelfIter, /* tp_iter */
10296 (iternextfunc)unicodeiter_next, /* tp_iternext */
10297 unicodeiter_methods, /* tp_methods */
10298 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010299};
10300
10301static PyObject *
10302unicode_iter(PyObject *seq)
10303{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010304 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010305
Benjamin Peterson14339b62009-01-31 16:36:08 +000010306 if (!PyUnicode_Check(seq)) {
10307 PyErr_BadInternalCall();
10308 return NULL;
10309 }
10310 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10311 if (it == NULL)
10312 return NULL;
10313 it->it_index = 0;
10314 Py_INCREF(seq);
10315 it->it_seq = (PyUnicodeObject *)seq;
10316 _PyObject_GC_TRACK(it);
10317 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010318}
10319
Martin v. Löwis5b222132007-06-10 09:51:05 +000010320size_t
10321Py_UNICODE_strlen(const Py_UNICODE *u)
10322{
10323 int res = 0;
10324 while(*u++)
10325 res++;
10326 return res;
10327}
10328
10329Py_UNICODE*
10330Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10331{
10332 Py_UNICODE *u = s1;
10333 while ((*u++ = *s2++));
10334 return s1;
10335}
10336
10337Py_UNICODE*
10338Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10339{
10340 Py_UNICODE *u = s1;
10341 while ((*u++ = *s2++))
10342 if (n-- == 0)
10343 break;
10344 return s1;
10345}
10346
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010347Py_UNICODE*
10348Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10349{
10350 Py_UNICODE *u1 = s1;
10351 u1 += Py_UNICODE_strlen(u1);
10352 Py_UNICODE_strcpy(u1, s2);
10353 return s1;
10354}
10355
Martin v. Löwis5b222132007-06-10 09:51:05 +000010356int
10357Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10358{
10359 while (*s1 && *s2 && *s1 == *s2)
10360 s1++, s2++;
10361 if (*s1 && *s2)
10362 return (*s1 < *s2) ? -1 : +1;
10363 if (*s1)
10364 return 1;
10365 if (*s2)
10366 return -1;
10367 return 0;
10368}
10369
Victor Stinneref8d95c2010-08-16 22:03:11 +000010370int
10371Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10372{
10373 register Py_UNICODE u1, u2;
10374 for (; n != 0; n--) {
10375 u1 = *s1;
10376 u2 = *s2;
10377 if (u1 != u2)
10378 return (u1 < u2) ? -1 : +1;
10379 if (u1 == '\0')
10380 return 0;
10381 s1++;
10382 s2++;
10383 }
10384 return 0;
10385}
10386
Martin v. Löwis5b222132007-06-10 09:51:05 +000010387Py_UNICODE*
10388Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10389{
10390 const Py_UNICODE *p;
10391 for (p = s; *p; p++)
10392 if (*p == c)
10393 return (Py_UNICODE*)p;
10394 return NULL;
10395}
10396
Victor Stinner331ea922010-08-10 16:37:20 +000010397Py_UNICODE*
10398Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10399{
10400 const Py_UNICODE *p;
10401 p = s + Py_UNICODE_strlen(s);
10402 while (p != s) {
10403 p--;
10404 if (*p == c)
10405 return (Py_UNICODE*)p;
10406 }
10407 return NULL;
10408}
10409
Victor Stinner71133ff2010-09-01 23:43:53 +000010410Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010411PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010412{
10413 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10414 Py_UNICODE *copy;
10415 Py_ssize_t size;
10416
10417 /* Ensure we won't overflow the size. */
10418 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10419 PyErr_NoMemory();
10420 return NULL;
10421 }
10422 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10423 size *= sizeof(Py_UNICODE);
10424 copy = PyMem_Malloc(size);
10425 if (copy == NULL) {
10426 PyErr_NoMemory();
10427 return NULL;
10428 }
10429 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10430 return copy;
10431}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010432
Georg Brandl66c221e2010-10-14 07:04:07 +000010433/* A _string module, to export formatter_parser and formatter_field_name_split
10434 to the string.Formatter class implemented in Python. */
10435
10436static PyMethodDef _string_methods[] = {
10437 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10438 METH_O, PyDoc_STR("split the argument as a field name")},
10439 {"formatter_parser", (PyCFunction) formatter_parser,
10440 METH_O, PyDoc_STR("parse the argument as a format string")},
10441 {NULL, NULL}
10442};
10443
10444static struct PyModuleDef _string_module = {
10445 PyModuleDef_HEAD_INIT,
10446 "_string",
10447 PyDoc_STR("string helper module"),
10448 0,
10449 _string_methods,
10450 NULL,
10451 NULL,
10452 NULL,
10453 NULL
10454};
10455
10456PyMODINIT_FUNC
10457PyInit__string(void)
10458{
10459 return PyModule_Create(&_string_module);
10460}
10461
10462
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010463#ifdef __cplusplus
10464}
10465#endif