blob: 8680726275e7cb2334ff12ddef63f1f499ac5e4d [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
Christian Heimes190d79e2008-01-30 11:58:22 +0000114/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by ASCII code (0..127): an entry is 1 when the
   character is one of the frequent whitespace characters, 0 otherwise.
   Each row below covers 16 consecutive codes. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
144
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000145static PyObject *unicode_encode_call_errorhandler(const char *errors,
146 PyObject **errorHandler,const char *encoding, const char *reason,
147 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
148 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
149
Victor Stinner31be90b2010-04-22 19:38:16 +0000150static void raise_encode_exception(PyObject **exceptionObject,
151 const char *encoding,
152 const Py_UNICODE *unicode, Py_ssize_t size,
153 Py_ssize_t startpos, Py_ssize_t endpos,
154 const char *reason);
155
/* Lookup table indexed by ASCII code (0..127): an entry is 1 when the
   character is a line break, 0 otherwise.  The table is only ever read
   (see BLOOM_LINEBREAK), so it is const like _Py_ascii_whitespace.
   Each row below covers 16 consecutive codes. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
183
184
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000185Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000186PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000188#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000189 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 /* This is actually an illegal character, so it should
192 not be passed to unichr. */
193 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#endif
195}
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197/* --- Bloom Filters ----------------------------------------------------- */
198
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
202
203/* the linebreak mask is set up by Unicode_Init below */
204
Antoine Pitrouf068f942010-01-13 14:19:12 +0000205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215#define BLOOM_MASK unsigned long
216
217static BLOOM_MASK bloom_linebreak;
218
Antoine Pitrouf068f942010-01-13 14:19:12 +0000219#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
220#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221
Benjamin Peterson29060642009-01-31 22:14:21 +0000222#define BLOOM_LINEBREAK(ch) \
223 ((ch) < 128U ? ascii_linebreak[(ch)] : \
224 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228 /* calculate simple bloom-style bitmask for a given unicode string */
229
Antoine Pitrouf068f942010-01-13 14:19:12 +0000230 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 Py_ssize_t i;
232
233 mask = 0;
234 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000235 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236
237 return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242 Py_ssize_t i;
243
244 for (i = 0; i < setlen; i++)
245 if (set[i] == chr)
246 return 1;
247
248 return 0;
249}
250
/* Nonzero when `chr` is a member of the character set `set` of length
   `setlen`.  The bloom `mask` is tested first so that unicode_member()
   is only called when the mask bit for `chr` is set.
   The whole expansion is parenthesized so the macro can be negated or
   combined with other operators without changing its meaning.
   Note: `chr` is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                 \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
253
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254/* --- Unicode Object ----------------------------------------------------- */
255
256static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259{
260 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000261
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000266 /* Resizing shared object (unicode_empty or single character
267 objects) in-place is not allowed. Use PyUnicode_Resize()
268 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 (unicode->length == 1 &&
272 unicode->str[0] < 256U &&
273 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000275 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 return -1;
277 }
278
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279 /* We allocate one more byte to make sure the string is Ux0000 terminated.
280 The overallocation is also used by fastsearch, which assumes that it's
281 safe to look at str[length] (without making any assumptions about what
282 it contains). */
283
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000285 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000286 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 PyErr_NoMemory();
290 return -1;
291 }
292 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294
Benjamin Peterson29060642009-01-31 22:14:21 +0000295 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000298 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
305/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000306 Ux0000 terminated; some code (e.g. new_identifier)
307 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308
309 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000310 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000417 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 }
419 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000420 *(PyUnicodeObject **)unicode = free_list;
421 free_list = unicode;
422 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000425 PyObject_DEL(unicode->str);
426 Py_XDECREF(unicode->defenc);
427 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 }
429}
430
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433{
434 register PyUnicodeObject *v;
435
436 /* Argument checks */
437 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000441 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 PyErr_BadInternalCall();
444 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 }
446
447 /* Resizing unicode_empty and single character objects is not
448 possible since these are being shared. We simply return a fresh
449 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000450 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 (v == unicode_empty || v->length == 1)) {
452 PyUnicodeObject *w = _PyUnicode_New(length);
453 if (w == NULL)
454 return -1;
455 Py_UNICODE_COPY(w->str, v->str,
456 length < v->length ? length : v->length);
457 Py_DECREF(*unicode);
458 *unicode = w;
459 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
461
462 /* Note that we don't have to modify *unicode for unshared Unicode
463 objects, since we can modify them in-place. */
464 return unicode_resize(v, length);
465}
466
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474{
475 PyUnicodeObject *unicode;
476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 /* If the Unicode data is known at construction time, we can apply
478 some optimizations which share commonly used objects. */
479 if (u != NULL) {
480
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000486
487 /* Single character Unicode objects in the Latin-1 range are
488 shared when using this constructor */
489 if (size == 1 && *u < 256) {
490 unicode = unicode_latin1[*u];
491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = *u;
496 unicode_latin1[*u] = unicode;
497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000501 }
Tim Petersced69f82003-09-16 20:30:58 +0000502
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 /* Copy the Unicode data into the new object */
508 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000509 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 return (PyObject *)unicode;
512}
513
Walter Dörwaldd2034312007-05-18 16:29:38 +0000514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000515{
516 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000517
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 return NULL;
522 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000525 some optimizations which share commonly used objects.
526 Also, this means the input must be UTF-8, so fall back to the
527 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (u != NULL) {
529
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 /* Optimization for empty strings */
531 if (size == 0 && unicode_empty != NULL) {
532 Py_INCREF(unicode_empty);
533 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000534 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000535
536 /* Single characters are shared when using this constructor.
537 Restrict to ASCII, since the input must be UTF-8. */
538 if (size == 1 && Py_CHARMASK(*u) < 128) {
539 unicode = unicode_latin1[Py_CHARMASK(*u)];
540 if (!unicode) {
541 unicode = _PyUnicode_New(1);
542 if (!unicode)
543 return NULL;
544 unicode->str[0] = Py_CHARMASK(*u);
545 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 }
547 Py_INCREF(unicode);
548 return (PyObject *)unicode;
549 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000550
551 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000552 }
553
Walter Dörwald55507312007-05-18 13:12:10 +0000554 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 if (!unicode)
556 return NULL;
557
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 return (PyObject *)unicode;
559}
560
Walter Dörwaldd2034312007-05-18 16:29:38 +0000561PyObject *PyUnicode_FromString(const char *u)
562{
563 size_t size = strlen(u);
564 if (size > PY_SSIZE_T_MAX) {
565 PyErr_SetString(PyExc_OverflowError, "input too long");
566 return NULL;
567 }
568
569 return PyUnicode_FromStringAndSize(u, size);
570}
571
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572#ifdef HAVE_WCHAR_H
573
Mark Dickinson081dfee2009-03-18 14:47:41 +0000574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581 to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584 Py_ssize_t size)
585{
586 PyUnicodeObject *unicode;
587 register Py_ssize_t i;
588 Py_ssize_t alloc;
589 const wchar_t *orig_w;
590
591 if (w == NULL) {
592 if (size == 0)
593 return PyUnicode_FromStringAndSize(NULL, 0);
594 PyErr_BadInternalCall();
595 return NULL;
596 }
597
598 if (size == -1) {
599 size = wcslen(w);
600 }
601
602 alloc = size;
603 orig_w = w;
604 for (i = size; i > 0; i--) {
605 if (*w > 0xFFFF)
606 alloc++;
607 w++;
608 }
609 w = orig_w;
610 unicode = _PyUnicode_New(alloc);
611 if (!unicode)
612 return NULL;
613
614 /* Copy the wchar_t data into the new object */
615 {
616 register Py_UNICODE *u;
617 u = PyUnicode_AS_UNICODE(unicode);
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF) {
620 wchar_t ordinal = *w++;
621 ordinal -= 0x10000;
622 *u++ = 0xD800 | (ordinal >> 10);
623 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 }
625 else
626 *u++ = *w++;
627 }
628 }
629 return (PyObject *)unicode;
630}
631
632#else
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636{
637 PyUnicodeObject *unicode;
638
639 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000640 if (size == 0)
641 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == -1) {
647 size = wcslen(w);
648 }
649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = _PyUnicode_New(size);
651 if (!unicode)
652 return NULL;
653
654 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000657#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000659 register Py_UNICODE *u;
660 register Py_ssize_t i;
661 u = PyUnicode_AS_UNICODE(unicode);
662 for (i = size; i > 0; i--)
663 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665#endif
666
667 return (PyObject *)unicode;
668}
669
Mark Dickinson081dfee2009-03-18 14:47:41 +0000670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
Walter Dörwald346737f2007-05-31 10:44:43 +0000674static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000688 else if (longlongflag) {
689 /* longlongflag should only ever be nonzero on machines with
690 HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692 char *f = PY_FORMAT_LONG_LONG;
693 while (*f)
694 *fmt++ = *f++;
695#else
696 /* we shouldn't ever get here */
697 assert(0);
698 *fmt++ = 'l';
699#endif
700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000701 else if (size_tflag) {
702 char *f = PY_FORMAT_SIZE_T;
703 while (*f)
704 *fmt++ = *f++;
705 }
706 *fmt++ = c;
707 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000708}
709
Walter Dörwaldd2034312007-05-18 16:29:38 +0000710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
711
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000712/* size of fixed-size buffer for formatting single arguments */
713#define ITEM_BUFFER_LEN 21
714/* maximum number of characters required for output of %ld. 21 characters
715 allows for 64-bit integers (in decimal) and an optional sign. */
716#define MAX_LONG_CHARS 21
717/* maximum number of characters required for output of %lld.
718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 va_list count;
726 Py_ssize_t callcount = 0;
727 PyObject **callresults = NULL;
728 PyObject **callresult = NULL;
729 Py_ssize_t n = 0;
730 int width = 0;
731 int precision = 0;
732 int zeropad;
733 const char* f;
734 Py_UNICODE *s;
735 PyObject *string;
736 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000737 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 /* use abuffer instead of buffer, if we need more space
739 * (which can happen if there's a format specifier with width). */
740 char *abuffer = NULL;
741 char *realbuffer;
742 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000746 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000747 /* step 1: count the number of %S/%R/%A/%s format specifications
748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000752 if (*f == '%') {
753 if (*(f+1)=='%')
754 continue;
Victor Stinner2b574a22011-03-01 22:48:49 +0000755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000756 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000757 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000758 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 ;
761 if (*f == 's')
762 ++callcount;
763 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000764 else if (128 <= (unsigned char)*f) {
765 PyErr_Format(PyExc_ValueError,
766 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000767 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000768 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000769 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 }
772 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 if (callcount) {
775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776 if (!callresults) {
777 PyErr_NoMemory();
778 return NULL;
779 }
780 callresult = callresults;
781 }
782 /* step 3: figure out how large a buffer we need */
783 for (f = format; *f; f++) {
784 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000785#ifdef HAVE_LONG_LONG
786 int longlongflag = 0;
787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000788 const char* p = f;
789 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000790 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796 * they don't affect the amount of space we reserve.
797 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000798 if (*f == 'l') {
799 if (f[1] == 'd' || f[1] == 'u') {
800 ++f;
801 }
802#ifdef HAVE_LONG_LONG
803 else if (f[1] == 'l' &&
804 (f[2] == 'd' || f[2] == 'u')) {
805 longlongflag = 1;
806 f += 2;
807 }
808#endif
809 }
810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000811 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000812 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
Benjamin Peterson14339b62009-01-31 16:36:08 +0000814 switch (*f) {
815 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +0000816 {
817#ifndef Py_UNICODE_WIDE
818 int ordinal = va_arg(count, int);
819 if (ordinal > 0xffff)
820 n += 2;
821 else
822 n++;
823#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000824 (void)va_arg(count, int);
Victor Stinner659eb842011-02-23 12:14:22 +0000825 n++;
826#endif
827 break;
828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 case '%':
830 n++;
831 break;
832 case 'd': case 'u': case 'i': case 'x':
833 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000834#ifdef HAVE_LONG_LONG
835 if (longlongflag) {
836 if (width < MAX_LONG_LONG_CHARS)
837 width = MAX_LONG_LONG_CHARS;
838 }
839 else
840#endif
841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842 including sign. Decimal takes the most space. This
843 isn't enough for octal. If a width is specified we
844 need more (which we allocate later). */
845 if (width < MAX_LONG_CHARS)
846 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000848 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000849 if (abuffersize < width)
850 abuffersize = width;
851 break;
852 case 's':
853 {
854 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000855 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857 if (!str)
858 goto fail;
859 n += PyUnicode_GET_SIZE(str);
860 /* Remember the str and switch to the next slot */
861 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000862 break;
863 }
864 case 'U':
865 {
866 PyObject *obj = va_arg(count, PyObject *);
867 assert(obj && PyUnicode_Check(obj));
868 n += PyUnicode_GET_SIZE(obj);
869 break;
870 }
871 case 'V':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 const char *str = va_arg(count, const char *);
Victor Stinner2b574a22011-03-01 22:48:49 +0000875 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000876 assert(obj || str);
877 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2b574a22011-03-01 22:48:49 +0000878 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000879 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2b574a22011-03-01 22:48:49 +0000880 *callresult++ = NULL;
881 }
882 else {
883 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
884 if (!str_obj)
885 goto fail;
886 n += PyUnicode_GET_SIZE(str_obj);
887 *callresult++ = str_obj;
888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000889 break;
890 }
891 case 'S':
892 {
893 PyObject *obj = va_arg(count, PyObject *);
894 PyObject *str;
895 assert(obj);
896 str = PyObject_Str(obj);
897 if (!str)
898 goto fail;
899 n += PyUnicode_GET_SIZE(str);
900 /* Remember the str and switch to the next slot */
901 *callresult++ = str;
902 break;
903 }
904 case 'R':
905 {
906 PyObject *obj = va_arg(count, PyObject *);
907 PyObject *repr;
908 assert(obj);
909 repr = PyObject_Repr(obj);
910 if (!repr)
911 goto fail;
912 n += PyUnicode_GET_SIZE(repr);
913 /* Remember the repr and switch to the next slot */
914 *callresult++ = repr;
915 break;
916 }
917 case 'A':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 PyObject *ascii;
921 assert(obj);
922 ascii = PyObject_ASCII(obj);
923 if (!ascii)
924 goto fail;
925 n += PyUnicode_GET_SIZE(ascii);
926 /* Remember the repr and switch to the next slot */
927 *callresult++ = ascii;
928 break;
929 }
930 case 'p':
931 (void) va_arg(count, int);
932 /* maximum 64-bit pointer representation:
933 * 0xffffffffffffffff
934 * so 19 characters is enough.
935 * XXX I count 18 -- what's the extra for?
936 */
937 n += 19;
938 break;
939 default:
940 /* if we stumble upon an unknown
941 formatting code, copy the rest of
942 the format string to the output
943 string. (we cannot just skip the
944 code, since there's no way to know
945 what's in the argument list) */
946 n += strlen(p);
947 goto expand;
948 }
949 } else
950 n++;
951 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000952 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000953 if (abuffersize > ITEM_BUFFER_LEN) {
954 /* add 1 for sprintf's trailing null byte */
955 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 if (!abuffer) {
957 PyErr_NoMemory();
958 goto fail;
959 }
960 realbuffer = abuffer;
961 }
962 else
963 realbuffer = buffer;
964 /* step 4: fill the buffer */
965 /* Since we've analyzed how much space we need for the worst case,
966 we don't have to resize the string.
967 There can be no errors beyond this point. */
968 string = PyUnicode_FromUnicode(NULL, n);
969 if (!string)
970 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000971
Benjamin Peterson14339b62009-01-31 16:36:08 +0000972 s = PyUnicode_AS_UNICODE(string);
973 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000974
Benjamin Peterson14339b62009-01-31 16:36:08 +0000975 for (f = format; *f; f++) {
976 if (*f == '%') {
977 const char* p = f++;
978 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000979 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000980 int size_tflag = 0;
981 zeropad = (*f == '0');
982 /* parse the width.precision part */
983 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000984 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000985 width = (width*10) + *f++ - '0';
986 precision = 0;
987 if (*f == '.') {
988 f++;
David Malcolm96960882010-11-05 17:23:41 +0000989 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000990 precision = (precision*10) + *f++ - '0';
991 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000992 /* Handle %ld, %lu, %lld and %llu. */
993 if (*f == 'l') {
994 if (f[1] == 'd' || f[1] == 'u') {
995 longflag = 1;
996 ++f;
997 }
998#ifdef HAVE_LONG_LONG
999 else if (f[1] == 'l' &&
1000 (f[2] == 'd' || f[2] == 'u')) {
1001 longlongflag = 1;
1002 f += 2;
1003 }
1004#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001005 }
1006 /* handle the size_t flag. */
1007 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1008 size_tflag = 1;
1009 ++f;
1010 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001011
Benjamin Peterson14339b62009-01-31 16:36:08 +00001012 switch (*f) {
1013 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +00001014 {
1015 int ordinal = va_arg(vargs, int);
1016#ifndef Py_UNICODE_WIDE
1017 if (ordinal > 0xffff) {
1018 ordinal -= 0x10000;
1019 *s++ = 0xD800 | (ordinal >> 10);
1020 *s++ = 0xDC00 | (ordinal & 0x3FF);
1021 } else
1022#endif
1023 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001024 break;
Victor Stinner659eb842011-02-23 12:14:22 +00001025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001026 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001027 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1028 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001029 if (longflag)
1030 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001031#ifdef HAVE_LONG_LONG
1032 else if (longlongflag)
1033 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1034#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 else if (size_tflag)
1036 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1037 else
1038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001042 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1043 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001044 if (longflag)
1045 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001046#ifdef HAVE_LONG_LONG
1047 else if (longlongflag)
1048 sprintf(realbuffer, fmt, va_arg(vargs,
1049 unsigned PY_LONG_LONG));
1050#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 else if (size_tflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1053 else
1054 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1055 appendstring(realbuffer);
1056 break;
1057 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001058 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001059 sprintf(realbuffer, fmt, va_arg(vargs, int));
1060 appendstring(realbuffer);
1061 break;
1062 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001063 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 sprintf(realbuffer, fmt, va_arg(vargs, int));
1065 appendstring(realbuffer);
1066 break;
1067 case 's':
1068 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001069 /* unused, since we already have the result */
1070 (void) va_arg(vargs, char *);
1071 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1072 PyUnicode_GET_SIZE(*callresult));
1073 s += PyUnicode_GET_SIZE(*callresult);
1074 /* We're done with the unicode()/repr() => forget it */
1075 Py_DECREF(*callresult);
1076 /* switch to next unicode()/repr() result */
1077 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 break;
1079 }
1080 case 'U':
1081 {
1082 PyObject *obj = va_arg(vargs, PyObject *);
1083 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1084 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1085 s += size;
1086 break;
1087 }
1088 case 'V':
1089 {
1090 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2b574a22011-03-01 22:48:49 +00001091 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001092 if (obj) {
1093 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1094 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1095 s += size;
1096 } else {
Victor Stinner2b574a22011-03-01 22:48:49 +00001097 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1098 PyUnicode_GET_SIZE(*callresult));
1099 s += PyUnicode_GET_SIZE(*callresult);
1100 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001101 }
Victor Stinner2b574a22011-03-01 22:48:49 +00001102 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001103 break;
1104 }
1105 case 'S':
1106 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001107 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001108 {
1109 Py_UNICODE *ucopy;
1110 Py_ssize_t usize;
1111 Py_ssize_t upos;
1112 /* unused, since we already have the result */
1113 (void) va_arg(vargs, PyObject *);
1114 ucopy = PyUnicode_AS_UNICODE(*callresult);
1115 usize = PyUnicode_GET_SIZE(*callresult);
1116 for (upos = 0; upos<usize;)
1117 *s++ = ucopy[upos++];
1118 /* We're done with the unicode()/repr() => forget it */
1119 Py_DECREF(*callresult);
1120 /* switch to next unicode()/repr() result */
1121 ++callresult;
1122 break;
1123 }
1124 case 'p':
1125 sprintf(buffer, "%p", va_arg(vargs, void*));
1126 /* %p is ill-defined: ensure leading 0x. */
1127 if (buffer[1] == 'X')
1128 buffer[1] = 'x';
1129 else if (buffer[1] != 'x') {
1130 memmove(buffer+2, buffer, strlen(buffer)+1);
1131 buffer[0] = '0';
1132 buffer[1] = 'x';
1133 }
1134 appendstring(buffer);
1135 break;
1136 case '%':
1137 *s++ = '%';
1138 break;
1139 default:
1140 appendstring(p);
1141 goto end;
1142 }
Victor Stinner1205f272010-09-11 00:54:47 +00001143 }
Victor Stinner1205f272010-09-11 00:54:47 +00001144 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001145 *s++ = *f;
1146 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147
Benjamin Peterson29060642009-01-31 22:14:21 +00001148 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 if (callresults)
1150 PyObject_Free(callresults);
1151 if (abuffer)
1152 PyObject_Free(abuffer);
1153 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1154 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001155 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001156 if (callresults) {
1157 PyObject **callresult2 = callresults;
1158 while (callresult2 < callresult) {
Victor Stinner2b574a22011-03-01 22:48:49 +00001159 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 ++callresult2;
1161 }
1162 PyObject_Free(callresults);
1163 }
1164 if (abuffer)
1165 PyObject_Free(abuffer);
1166 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001167}
1168
1169#undef appendstring
1170
1171PyObject *
1172PyUnicode_FromFormat(const char *format, ...)
1173{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 PyObject* ret;
1175 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001176
1177#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001178 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001179#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001180 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001181#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001182 ret = PyUnicode_FromFormatV(format, vargs);
1183 va_end(vargs);
1184 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001185}
1186
Victor Stinner5593d8a2010-10-02 11:11:27 +00001187/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1188 convert a Unicode object to a wide character string.
1189
Victor Stinnerd88d9832011-09-06 02:00:05 +02001190 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001191 character) required to convert the unicode object. Ignore size argument.
1192
Victor Stinnerd88d9832011-09-06 02:00:05 +02001193 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001194 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001195 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001196static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001197unicode_aswidechar(PyUnicodeObject *unicode,
1198 wchar_t *w,
1199 Py_ssize_t size)
1200{
1201#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001202 Py_ssize_t res;
1203 if (w != NULL) {
1204 res = PyUnicode_GET_SIZE(unicode);
1205 if (size > res)
1206 size = res + 1;
1207 else
1208 res = size;
1209 memcpy(w, unicode->str, size * sizeof(wchar_t));
1210 return res;
1211 }
1212 else
1213 return PyUnicode_GET_SIZE(unicode) + 1;
1214#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1215 register const Py_UNICODE *u;
1216 const Py_UNICODE *uend;
1217 const wchar_t *worig, *wend;
1218 Py_ssize_t nchar;
1219
Victor Stinner137c34c2010-09-29 10:25:54 +00001220 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001221 uend = u + PyUnicode_GET_SIZE(unicode);
1222 if (w != NULL) {
1223 worig = w;
1224 wend = w + size;
1225 while (u != uend && w != wend) {
1226 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1227 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1228 {
1229 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1230 u += 2;
1231 }
1232 else {
1233 *w = *u;
1234 u++;
1235 }
1236 w++;
1237 }
1238 if (w != wend)
1239 *w = L'\0';
1240 return w - worig;
1241 }
1242 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001243 nchar = 1; /* null character at the end */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001244 while (u != uend) {
1245 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1246 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1247 u += 2;
1248 else
1249 u++;
1250 nchar++;
1251 }
1252 }
1253 return nchar;
1254#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1255 register Py_UNICODE *u, *uend, ordinal;
1256 register Py_ssize_t i;
1257 wchar_t *worig, *wend;
1258 Py_ssize_t nchar;
1259
1260 u = PyUnicode_AS_UNICODE(unicode);
1261 uend = u + PyUnicode_GET_SIZE(u);
1262 if (w != NULL) {
1263 worig = w;
1264 wend = w + size;
1265 while (u != uend && w != wend) {
1266 ordinal = *u;
1267 if (ordinal > 0xffff) {
1268 ordinal -= 0x10000;
1269 *w++ = 0xD800 | (ordinal >> 10);
1270 *w++ = 0xDC00 | (ordinal & 0x3FF);
1271 }
1272 else
1273 *w++ = ordinal;
1274 u++;
1275 }
1276 if (w != wend)
1277 *w = 0;
1278 return w - worig;
1279 }
1280 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001281 nchar = 1; /* null character */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001282 while (u != uend) {
1283 if (*u > 0xffff)
1284 nchar += 2;
1285 else
1286 nchar++;
1287 u++;
1288 }
1289 return nchar;
1290 }
1291#else
1292# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001293#endif
1294}
1295
1296Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001297PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001298 wchar_t *w,
1299 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001302 PyErr_BadInternalCall();
1303 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001305 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306}
1307
Victor Stinner137c34c2010-09-29 10:25:54 +00001308wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001309PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001310 Py_ssize_t *size)
1311{
1312 wchar_t* buffer;
1313 Py_ssize_t buflen;
1314
1315 if (unicode == NULL) {
1316 PyErr_BadInternalCall();
1317 return NULL;
1318 }
1319
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001320 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001321 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001322 PyErr_NoMemory();
1323 return NULL;
1324 }
1325
Victor Stinner137c34c2010-09-29 10:25:54 +00001326 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1327 if (buffer == NULL) {
1328 PyErr_NoMemory();
1329 return NULL;
1330 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001331 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001332 if (size != NULL)
1333 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001334 return buffer;
1335}
1336
Guido van Rossumd57fd912000-03-10 22:53:23 +00001337#endif
1338
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001339PyObject *PyUnicode_FromOrdinal(int ordinal)
1340{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001341 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001342
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001343 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001344 PyErr_SetString(PyExc_ValueError,
1345 "chr() arg not in range(0x110000)");
1346 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001347 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001348
1349#ifndef Py_UNICODE_WIDE
1350 if (ordinal > 0xffff) {
1351 ordinal -= 0x10000;
1352 s[0] = 0xD800 | (ordinal >> 10);
1353 s[1] = 0xDC00 | (ordinal & 0x3FF);
1354 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001355 }
1356#endif
1357
Hye-Shik Chang40574832004-04-06 07:24:51 +00001358 s[0] = (Py_UNICODE)ordinal;
1359 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360}
1361
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362PyObject *PyUnicode_FromObject(register PyObject *obj)
1363{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001364 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001366 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001367 Py_INCREF(obj);
1368 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001369 }
1370 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 /* For a Unicode subtype that's not a Unicode object,
1372 return a true Unicode object with the same data. */
1373 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1374 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001375 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001376 PyErr_Format(PyExc_TypeError,
1377 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001378 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001379 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001380}
1381
1382PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001383 const char *encoding,
1384 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001385{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001386 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001387 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001388
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 PyErr_BadInternalCall();
1391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001393
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001394 /* Decoding bytes objects is the most common case and should be fast */
1395 if (PyBytes_Check(obj)) {
1396 if (PyBytes_GET_SIZE(obj) == 0) {
1397 Py_INCREF(unicode_empty);
1398 v = (PyObject *) unicode_empty;
1399 }
1400 else {
1401 v = PyUnicode_Decode(
1402 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1403 encoding, errors);
1404 }
1405 return v;
1406 }
1407
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001408 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001409 PyErr_SetString(PyExc_TypeError,
1410 "decoding str is not supported");
1411 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001412 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1415 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1416 PyErr_Format(PyExc_TypeError,
1417 "coercing to str: need bytes, bytearray "
1418 "or buffer-like object, %.80s found",
1419 Py_TYPE(obj)->tp_name);
1420 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001421 }
Tim Petersced69f82003-09-16 20:30:58 +00001422
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001423 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001424 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001425 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 }
Tim Petersced69f82003-09-16 20:30:58 +00001427 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001428 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001429
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001430 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001431 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432}
1433
/* Copy encoding into lower, converting upper case letters to lower case
   and replacing '_' with '-', so that e.g. "UTF_8" is recognised as
   "utf-8".  lower is a buffer of lower_len bytes.

   Return 1 on success (lower is then null-terminated), or 0 if the
   encoding name is longer than lower_len-1 characters (lower is then left
   partially written and not terminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last byte of the buffer, reserved for the terminating null */
    char *limit = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1466
1467PyObject *PyUnicode_Decode(const char *s,
1468 Py_ssize_t size,
1469 const char *encoding,
1470 const char *errors)
1471{
1472 PyObject *buffer = NULL, *unicode;
1473 Py_buffer info;
1474 char lower[11]; /* Enough for any encoding shortcut */
1475
1476 if (encoding == NULL)
1477 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001478
1479 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001480 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1481 if (strcmp(lower, "utf-8") == 0)
1482 return PyUnicode_DecodeUTF8(s, size, errors);
1483 else if ((strcmp(lower, "latin-1") == 0) ||
1484 (strcmp(lower, "iso-8859-1") == 0))
1485 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001486#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001487 else if (strcmp(lower, "mbcs") == 0)
1488 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001489#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001490 else if (strcmp(lower, "ascii") == 0)
1491 return PyUnicode_DecodeASCII(s, size, errors);
1492 else if (strcmp(lower, "utf-16") == 0)
1493 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1494 else if (strcmp(lower, "utf-32") == 0)
1495 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497
1498 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001499 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001500 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001501 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001502 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 if (buffer == NULL)
1504 goto onError;
1505 unicode = PyCodec_Decode(buffer, encoding, errors);
1506 if (unicode == NULL)
1507 goto onError;
1508 if (!PyUnicode_Check(unicode)) {
1509 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001510 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001511 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 Py_DECREF(unicode);
1513 goto onError;
1514 }
1515 Py_DECREF(buffer);
1516 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001517
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 Py_XDECREF(buffer);
1520 return NULL;
1521}
1522
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001523PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1524 const char *encoding,
1525 const char *errors)
1526{
1527 PyObject *v;
1528
1529 if (!PyUnicode_Check(unicode)) {
1530 PyErr_BadArgument();
1531 goto onError;
1532 }
1533
1534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001536
1537 /* Decode via the codec registry */
1538 v = PyCodec_Decode(unicode, encoding, errors);
1539 if (v == NULL)
1540 goto onError;
1541 return v;
1542
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001544 return NULL;
1545}
1546
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001547PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
1550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 if (!PyUnicode_Check(v)) {
1566 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001567 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001568 Py_TYPE(v)->tp_name);
1569 Py_DECREF(v);
1570 goto onError;
1571 }
1572 return v;
1573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575 return NULL;
1576}
1577
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001579 Py_ssize_t size,
1580 const char *encoding,
1581 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582{
1583 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001584
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585 unicode = PyUnicode_FromUnicode(s, size);
1586 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1589 Py_DECREF(unicode);
1590 return v;
1591}
1592
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001593PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1594 const char *encoding,
1595 const char *errors)
1596{
1597 PyObject *v;
1598
1599 if (!PyUnicode_Check(unicode)) {
1600 PyErr_BadArgument();
1601 goto onError;
1602 }
1603
1604 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001606
1607 /* Encode via the codec registry */
1608 v = PyCodec_Encode(unicode, encoding, errors);
1609 if (v == NULL)
1610 goto onError;
1611 return v;
1612
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001614 return NULL;
1615}
1616
Victor Stinnerad158722010-10-27 00:25:46 +00001617PyObject *
1618PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001619{
Victor Stinner313a1202010-06-11 23:56:51 +00001620#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001621 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1622 PyUnicode_GET_SIZE(unicode),
1623 NULL);
1624#elif defined(__APPLE__)
1625 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1626 PyUnicode_GET_SIZE(unicode),
1627 "surrogateescape");
1628#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001629 PyInterpreterState *interp = PyThreadState_GET()->interp;
1630 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1631 cannot use it to encode and decode filenames before it is loaded. Load
1632 the Python codec requires to encode at least its own filename. Use the C
1633 version of the locale codec until the codec registry is initialized and
1634 the Python codec is loaded.
1635
1636 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1637 cannot only rely on it: check also interp->fscodec_initialized for
1638 subinterpreters. */
1639 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001640 return PyUnicode_AsEncodedString(unicode,
1641 Py_FileSystemDefaultEncoding,
1642 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001643 }
1644 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001645 /* locale encoding with surrogateescape */
1646 wchar_t *wchar;
1647 char *bytes;
1648 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001649 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001650
1651 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1652 if (wchar == NULL)
1653 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001654 bytes = _Py_wchar2char(wchar, &error_pos);
1655 if (bytes == NULL) {
1656 if (error_pos != (size_t)-1) {
1657 char *errmsg = strerror(errno);
1658 PyObject *exc = NULL;
1659 if (errmsg == NULL)
1660 errmsg = "Py_wchar2char() failed";
1661 raise_encode_exception(&exc,
1662 "filesystemencoding",
1663 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1664 error_pos, error_pos+1,
1665 errmsg);
1666 Py_XDECREF(exc);
1667 }
1668 else
1669 PyErr_NoMemory();
1670 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001671 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001672 }
1673 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001674
1675 bytes_obj = PyBytes_FromString(bytes);
1676 PyMem_Free(bytes);
1677 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001678 }
Victor Stinnerad158722010-10-27 00:25:46 +00001679#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001680}
1681
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1683 const char *encoding,
1684 const char *errors)
1685{
1686 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001687 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001688
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689 if (!PyUnicode_Check(unicode)) {
1690 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 }
Fred Drakee4315f52000-05-09 19:53:39 +00001693
Tim Petersced69f82003-09-16 20:30:58 +00001694 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001695 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001696
1697 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001698 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1699 if (strcmp(lower, "utf-8") == 0)
1700 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1701 PyUnicode_GET_SIZE(unicode),
1702 errors);
1703 else if ((strcmp(lower, "latin-1") == 0) ||
1704 (strcmp(lower, "iso-8859-1") == 0))
1705 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1706 PyUnicode_GET_SIZE(unicode),
1707 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001708#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001709 else if (strcmp(lower, "mbcs") == 0)
1710 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1711 PyUnicode_GET_SIZE(unicode),
1712 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001713#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001714 else if (strcmp(lower, "ascii") == 0)
1715 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1716 PyUnicode_GET_SIZE(unicode),
1717 errors);
1718 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001719 /* During bootstrap, we may need to find the encodings
1720 package, to load the file system encoding, and require the
1721 file system encoding in order to load the encodings
1722 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001723
Victor Stinner59e62db2010-05-15 13:14:32 +00001724 Break out of this dependency by assuming that the path to
1725 the encodings module is ASCII-only. XXX could try wcstombs
1726 instead, if the file system encoding is the locale's
1727 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001728 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001729 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1730 !PyThreadState_GET()->interp->codecs_initialized)
1731 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1732 PyUnicode_GET_SIZE(unicode),
1733 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734
1735 /* Encode via the codec registry */
1736 v = PyCodec_Encode(unicode, encoding, errors);
1737 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001738 return NULL;
1739
1740 /* The normal path */
1741 if (PyBytes_Check(v))
1742 return v;
1743
1744 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001745 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001746 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001747 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001748
1749 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1750 "encoder %s returned bytearray instead of bytes",
1751 encoding);
1752 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001753 Py_DECREF(v);
1754 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001755 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001756
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001757 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1758 Py_DECREF(v);
1759 return b;
1760 }
1761
1762 PyErr_Format(PyExc_TypeError,
1763 "encoder did not return a bytes object (type=%.400s)",
1764 Py_TYPE(v)->tp_name);
1765 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001766 return NULL;
1767}
1768
1769PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1770 const char *encoding,
1771 const char *errors)
1772{
1773 PyObject *v;
1774
1775 if (!PyUnicode_Check(unicode)) {
1776 PyErr_BadArgument();
1777 goto onError;
1778 }
1779
1780 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001781 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001782
1783 /* Encode via the codec registry */
1784 v = PyCodec_Encode(unicode, encoding, errors);
1785 if (v == NULL)
1786 goto onError;
1787 if (!PyUnicode_Check(v)) {
1788 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001789 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001790 Py_TYPE(v)->tp_name);
1791 Py_DECREF(v);
1792 goto onError;
1793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001795
Benjamin Peterson29060642009-01-31 22:14:21 +00001796 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 return NULL;
1798}
1799
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001800PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001801 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001802{
1803 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001804 if (v)
1805 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001806 if (errors != NULL)
1807 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001808 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001809 PyUnicode_GET_SIZE(unicode),
1810 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001811 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001812 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001813 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001814 return v;
1815}
1816
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001817PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001818PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001819 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001820 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1821}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001822
Christian Heimes5894ba72007-11-04 11:43:14 +00001823PyObject*
1824PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1825{
Victor Stinnerad158722010-10-27 00:25:46 +00001826#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1827 return PyUnicode_DecodeMBCS(s, size, NULL);
1828#elif defined(__APPLE__)
1829 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1830#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001831 PyInterpreterState *interp = PyThreadState_GET()->interp;
1832 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1833 cannot use it to encode and decode filenames before it is loaded. Load
1834 the Python codec requires to encode at least its own filename. Use the C
1835 version of the locale codec until the codec registry is initialized and
1836 the Python codec is loaded.
1837
1838 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1839 cannot only rely on it: check also interp->fscodec_initialized for
1840 subinterpreters. */
1841 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001842 return PyUnicode_Decode(s, size,
1843 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001844 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001845 }
1846 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001847 /* locale encoding with surrogateescape */
1848 wchar_t *wchar;
1849 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001850 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001851
1852 if (s[size] != '\0' || size != strlen(s)) {
1853 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1854 return NULL;
1855 }
1856
Victor Stinner168e1172010-10-16 23:16:16 +00001857 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001858 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001859 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001860
Victor Stinner168e1172010-10-16 23:16:16 +00001861 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001862 PyMem_Free(wchar);
1863 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001864 }
Victor Stinnerad158722010-10-27 00:25:46 +00001865#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001866}
1867
Martin v. Löwis011e8422009-05-05 04:43:17 +00001868
1869int
1870PyUnicode_FSConverter(PyObject* arg, void* addr)
1871{
1872 PyObject *output = NULL;
1873 Py_ssize_t size;
1874 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001875 if (arg == NULL) {
1876 Py_DECREF(*(PyObject**)addr);
1877 return 1;
1878 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001879 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001880 output = arg;
1881 Py_INCREF(output);
1882 }
1883 else {
1884 arg = PyUnicode_FromObject(arg);
1885 if (!arg)
1886 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001887 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001888 Py_DECREF(arg);
1889 if (!output)
1890 return 0;
1891 if (!PyBytes_Check(output)) {
1892 Py_DECREF(output);
1893 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1894 return 0;
1895 }
1896 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001897 size = PyBytes_GET_SIZE(output);
1898 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001899 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05001900 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001901 Py_DECREF(output);
1902 return 0;
1903 }
1904 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001905 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001906}
1907
1908
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001909int
1910PyUnicode_FSDecoder(PyObject* arg, void* addr)
1911{
1912 PyObject *output = NULL;
1913 Py_ssize_t size;
1914 void *data;
1915 if (arg == NULL) {
1916 Py_DECREF(*(PyObject**)addr);
1917 return 1;
1918 }
1919 if (PyUnicode_Check(arg)) {
1920 output = arg;
1921 Py_INCREF(output);
1922 }
1923 else {
1924 arg = PyBytes_FromObject(arg);
1925 if (!arg)
1926 return 0;
1927 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1928 PyBytes_GET_SIZE(arg));
1929 Py_DECREF(arg);
1930 if (!output)
1931 return 0;
1932 if (!PyUnicode_Check(output)) {
1933 Py_DECREF(output);
1934 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1935 return 0;
1936 }
1937 }
1938 size = PyUnicode_GET_SIZE(output);
1939 data = PyUnicode_AS_UNICODE(output);
1940 if (size != Py_UNICODE_strlen(data)) {
1941 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1942 Py_DECREF(output);
1943 return 0;
1944 }
1945 *(PyObject**)addr = output;
1946 return Py_CLEANUP_SUPPORTED;
1947}
1948
1949
Martin v. Löwis5b222132007-06-10 09:51:05 +00001950char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001951_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001952{
Christian Heimesf3863112007-11-22 07:46:41 +00001953 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001954 if (!PyUnicode_Check(unicode)) {
1955 PyErr_BadArgument();
1956 return NULL;
1957 }
Christian Heimesf3863112007-11-22 07:46:41 +00001958 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1959 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001960 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001961 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001962 *psize = PyBytes_GET_SIZE(bytes);
1963 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001964}
1965
1966char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001967_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001968{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001969 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001970}
1971
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1973{
1974 if (!PyUnicode_Check(unicode)) {
1975 PyErr_BadArgument();
1976 goto onError;
1977 }
1978 return PyUnicode_AS_UNICODE(unicode);
1979
Benjamin Peterson29060642009-01-31 22:14:21 +00001980 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 return NULL;
1982}
1983
Martin v. Löwis18e16552006-02-15 17:27:45 +00001984Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985{
1986 if (!PyUnicode_Check(unicode)) {
1987 PyErr_BadArgument();
1988 goto onError;
1989 }
1990 return PyUnicode_GET_SIZE(unicode);
1991
Benjamin Peterson29060642009-01-31 22:14:21 +00001992 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001993 return -1;
1994}
1995
/* Return the name of the default encoding.  The value is fixed: it is
   always the NUL-terminated string "utf-8", held in static storage. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2000
Victor Stinner554f3f02010-06-16 23:33:54 +00002001/* create or adjust a UnicodeDecodeError */
2002static void
2003make_decode_exception(PyObject **exceptionObject,
2004 const char *encoding,
2005 const char *input, Py_ssize_t length,
2006 Py_ssize_t startpos, Py_ssize_t endpos,
2007 const char *reason)
2008{
2009 if (*exceptionObject == NULL) {
2010 *exceptionObject = PyUnicodeDecodeError_Create(
2011 encoding, input, length, startpos, endpos, reason);
2012 }
2013 else {
2014 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2015 goto onError;
2016 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2017 goto onError;
2018 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2019 goto onError;
2020 }
2021 return;
2022
2023onError:
2024 Py_DECREF(*exceptionObject);
2025 *exceptionObject = NULL;
2026}
2027
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028/* error handling callback helper:
2029 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002030 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002031 and adjust various state variables.
2032 return 0 on success, -1 on error
2033*/
2034
2035static
2036int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00002037 const char *encoding, const char *reason,
2038 const char **input, const char **inend, Py_ssize_t *startinpos,
2039 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2040 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002042 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002043
2044 PyObject *restuple = NULL;
2045 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002046 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002047 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002048 Py_ssize_t requiredsize;
2049 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002051 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002052 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053 int res = -1;
2054
2055 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002056 *errorHandler = PyCodec_LookupError(errors);
2057 if (*errorHandler == NULL)
2058 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002059 }
2060
Victor Stinner554f3f02010-06-16 23:33:54 +00002061 make_decode_exception(exceptionObject,
2062 encoding,
2063 *input, *inend - *input,
2064 *startinpos, *endinpos,
2065 reason);
2066 if (*exceptionObject == NULL)
2067 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002068
2069 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2070 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002073 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002074 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075 }
2076 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002077 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002078
2079 /* Copy back the bytes variables, which might have been modified by the
2080 callback */
2081 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2082 if (!inputobj)
2083 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002084 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002085 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002086 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002087 *input = PyBytes_AS_STRING(inputobj);
2088 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002089 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002090 /* we can DECREF safely, as the exception has another reference,
2091 so the object won't go away. */
2092 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002093
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002095 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002096 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002097 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2098 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002099 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002100
2101 /* need more space? (at least enough for what we
2102 have+the replacement+the rest of the string (starting
2103 at the new input position), so we won't have to check space
2104 when there are no errors in the rest of the string) */
2105 repptr = PyUnicode_AS_UNICODE(repunicode);
2106 repsize = PyUnicode_GET_SIZE(repunicode);
2107 requiredsize = *outpos + repsize + insize-newpos;
2108 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002109 if (requiredsize<2*outsize)
2110 requiredsize = 2*outsize;
2111 if (_PyUnicode_Resize(output, requiredsize) < 0)
2112 goto onError;
2113 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002114 }
2115 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002116 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 Py_UNICODE_COPY(*outptr, repptr, repsize);
2118 *outptr += repsize;
2119 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002120
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 /* we made it! */
2122 res = 0;
2123
Benjamin Peterson29060642009-01-31 22:14:21 +00002124 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 Py_XDECREF(restuple);
2126 return res;
2127}
2128
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002129/* --- UTF-7 Codec -------------------------------------------------------- */
2130
Antoine Pitrou244651a2009-05-04 18:56:13 +00002131/* See RFC2152 for details. We encode conservatively and decode liberally. */
2132
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so arguments must be free of side effects. */

/* Non-zero if c belongs to the base-64 alphabet:
   'A'-'Z', 'a'-'z', '0'-'9', '+' or '/'. */

#define IS_BASE64(c)                            \
    (((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= '0' && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')

/* The 6-bit value (0..63) of base-64 character c.  The caller must have
   checked IS_BASE64(c) first: any character that is not a letter, a digit
   or '+' is mapped to 63, the value of '/'. */

#define FROM_BASE64(c)                                          \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                   \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :              \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :              \
     (c) == '+' ? 62 : 63)

/* The base-64 character for the bottom 6 bits of n; higher bits of n are
   masked off. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
2155
/* DECODE_DIRECT: true if byte c, met outside a base-64 section of a UTF-7
 * string, should be decoded as itself.  Decoding is permissive: every
 * ASCII byte (value below 128) qualifies except '+', which begins a
 * base64 string. */

#define DECODE_DIRECT(c)                        \
    ((c) < 128 && (c) != '+')
2163
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify at all (NUL and non-ASCII never do).
 * Set D characters always qualify; whitespace only when directWS is
 * true; Set O only when directO is true.  The flag arguments are
 * parenthesized so that any expression (e.g. `a || b`) can be passed
 * without interacting with the && inside the macro.  c may be
 * evaluated several times and must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)                     \
    ((c) < 128 && (c) > 0 &&                                    \
     ((utf7_category[(c)] == 0) ||                              \
      ((directWS) && (utf7_category[(c)] == 2)) ||              \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002209
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002210PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002211 Py_ssize_t size,
2212 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002213{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002214 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2215}
2216
Antoine Pitrou244651a2009-05-04 18:56:13 +00002217/* The decoder. The only state we preserve is our read position,
2218 * i.e. how many characters we have consumed. So if we end in the
2219 * middle of a shift sequence we have to back off the read position
2220 * and the output to the beginning of the sequence, otherwise we lose
2221 * all the shift state (seen bits, number of bits seen, high
2222 * surrogate). */
2223
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002224PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002225 Py_ssize_t size,
2226 const char *errors,
2227 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002228{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002229 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002230 Py_ssize_t startinpos;
2231 Py_ssize_t endinpos;
2232 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002233 const char *e;
2234 PyUnicodeObject *unicode;
2235 Py_UNICODE *p;
2236 const char *errmsg = "";
2237 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002238 Py_UNICODE *shiftOutStart;
2239 unsigned int base64bits = 0;
2240 unsigned long base64buffer = 0;
2241 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002242 PyObject *errorHandler = NULL;
2243 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002244
2245 unicode = _PyUnicode_New(size);
2246 if (!unicode)
2247 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002248 if (size == 0) {
2249 if (consumed)
2250 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002251 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002252 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002253
2254 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002255 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002256 e = s + size;
2257
2258 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002260 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002261 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002262
Antoine Pitrou244651a2009-05-04 18:56:13 +00002263 if (inShift) { /* in a base-64 section */
2264 if (IS_BASE64(ch)) { /* consume a base-64 character */
2265 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2266 base64bits += 6;
2267 s++;
2268 if (base64bits >= 16) {
2269 /* we have enough bits for a UTF-16 value */
2270 Py_UNICODE outCh = (Py_UNICODE)
2271 (base64buffer >> (base64bits-16));
2272 base64bits -= 16;
2273 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2274 if (surrogate) {
2275 /* expecting a second surrogate */
2276 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2277#ifdef Py_UNICODE_WIDE
2278 *p++ = (((surrogate & 0x3FF)<<10)
2279 | (outCh & 0x3FF)) + 0x10000;
2280#else
2281 *p++ = surrogate;
2282 *p++ = outCh;
2283#endif
2284 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002285 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002286 }
2287 else {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002288 *p++ = surrogate;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002289 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002290 }
2291 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002292 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293 /* first surrogate */
2294 surrogate = outCh;
2295 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002296 else {
2297 *p++ = outCh;
2298 }
2299 }
2300 }
2301 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002302 inShift = 0;
2303 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002304 if (surrogate) {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002305 *p++ = surrogate;
2306 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002307 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002308 if (base64bits > 0) { /* left-over bits */
2309 if (base64bits >= 6) {
2310 /* We've seen at least one base-64 character */
2311 errmsg = "partial character in shift sequence";
2312 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002313 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002314 else {
2315 /* Some bits remain; they should be zero */
2316 if (base64buffer != 0) {
2317 errmsg = "non-zero padding bits in shift sequence";
2318 goto utf7Error;
2319 }
2320 }
2321 }
2322 if (ch != '-') {
2323 /* '-' is absorbed; other terminating
2324 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002325 *p++ = ch;
2326 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002327 }
2328 }
2329 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002330 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002331 s++; /* consume '+' */
2332 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002333 s++;
2334 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002335 }
2336 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002337 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002338 shiftOutStart = p;
2339 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002340 }
2341 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002342 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002343 *p++ = ch;
2344 s++;
2345 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002346 else {
2347 startinpos = s-starts;
2348 s++;
2349 errmsg = "unexpected special character";
2350 goto utf7Error;
2351 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002352 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002353utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002354 outpos = p-PyUnicode_AS_UNICODE(unicode);
2355 endinpos = s-starts;
2356 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002357 errors, &errorHandler,
2358 "utf7", errmsg,
2359 &starts, &e, &startinpos, &endinpos, &exc, &s,
2360 &unicode, &outpos, &p))
2361 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002362 }
2363
Antoine Pitrou244651a2009-05-04 18:56:13 +00002364 /* end of string */
2365
2366 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2367 /* if we're in an inconsistent state, that's an error */
2368 if (surrogate ||
2369 (base64bits >= 6) ||
2370 (base64bits > 0 && base64buffer != 0)) {
2371 outpos = p-PyUnicode_AS_UNICODE(unicode);
2372 endinpos = size;
2373 if (unicode_decode_call_errorhandler(
2374 errors, &errorHandler,
2375 "utf7", "unterminated shift sequence",
2376 &starts, &e, &startinpos, &endinpos, &exc, &s,
2377 &unicode, &outpos, &p))
2378 goto onError;
2379 if (s < e)
2380 goto restart;
2381 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002382 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002383
2384 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002385 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002386 if (inShift) {
2387 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002388 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002389 }
2390 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002391 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002392 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002393 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002394
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002395 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002396 goto onError;
2397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002398 Py_XDECREF(errorHandler);
2399 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002400 return (PyObject *)unicode;
2401
Benjamin Peterson29060642009-01-31 22:14:21 +00002402 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002403 Py_XDECREF(errorHandler);
2404 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002405 Py_DECREF(unicode);
2406 return NULL;
2407}
2408
2409
2410PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002411 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002412 int base64SetO,
2413 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002414 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002415{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002416 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002417 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002418 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002419 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002420 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002421 unsigned int base64bits = 0;
2422 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002423 char * out;
2424 char * start;
2425
2426 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002427 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002428
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002429 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002430 return PyErr_NoMemory();
2431
Antoine Pitrou244651a2009-05-04 18:56:13 +00002432 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002433 if (v == NULL)
2434 return NULL;
2435
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002436 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002437 for (;i < size; ++i) {
2438 Py_UNICODE ch = s[i];
2439
Antoine Pitrou244651a2009-05-04 18:56:13 +00002440 if (inShift) {
2441 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2442 /* shifting out */
2443 if (base64bits) { /* output remaining bits */
2444 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2445 base64buffer = 0;
2446 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002447 }
2448 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002449 /* Characters not in the BASE64 set implicitly unshift the sequence
2450 so no '-' is required, except if the character is itself a '-' */
2451 if (IS_BASE64(ch) || ch == '-') {
2452 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002453 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002454 *out++ = (char) ch;
2455 }
2456 else {
2457 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002458 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002459 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002460 else { /* not in a shift sequence */
2461 if (ch == '+') {
2462 *out++ = '+';
2463 *out++ = '-';
2464 }
2465 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2466 *out++ = (char) ch;
2467 }
2468 else {
2469 *out++ = '+';
2470 inShift = 1;
2471 goto encode_char;
2472 }
2473 }
2474 continue;
2475encode_char:
2476#ifdef Py_UNICODE_WIDE
2477 if (ch >= 0x10000) {
2478 /* code first surrogate */
2479 base64bits += 16;
2480 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2481 while (base64bits >= 6) {
2482 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2483 base64bits -= 6;
2484 }
2485 /* prepare second surrogate */
2486 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2487 }
2488#endif
2489 base64bits += 16;
2490 base64buffer = (base64buffer << 16) | ch;
2491 while (base64bits >= 6) {
2492 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2493 base64bits -= 6;
2494 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002495 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002496 if (base64bits)
2497 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2498 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002499 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002500 if (_PyBytes_Resize(&v, out - start) < 0)
2501 return NULL;
2502 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002503}
2504
Antoine Pitrou244651a2009-05-04 18:56:13 +00002505#undef IS_BASE64
2506#undef FROM_BASE64
2507#undef TO_BASE64
2508#undef DECODE_DIRECT
2509#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002510
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511/* --- UTF-8 Codec -------------------------------------------------------- */
2512
/* Map a UTF-8 lead byte to the total length in bytes of the sequence it
   starts.  Zero means illegal prefix: continuation bytes 80-BF, C0-C1 and
   F5-FF.  See RFC 3629 for details.
   This is a read-only lookup table, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2534
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002536 Py_ssize_t size,
2537 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002538{
Walter Dörwald69652032004-09-07 20:24:22 +00002539 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2540}
2541
Antoine Pitrouab868312009-01-10 15:40:25 +00002542/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2543#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2544
2545/* Mask to quickly check whether a C 'long' contains a
2546 non-ASCII, UTF8-encoded char. */
2547#if (SIZEOF_LONG == 8)
2548# define ASCII_CHAR_MASK 0x8080808080808080L
2549#elif (SIZEOF_LONG == 4)
2550# define ASCII_CHAR_MASK 0x80808080L
2551#else
2552# error C 'long' size should be either 4 or 8!
2553#endif
2554
Walter Dörwald69652032004-09-07 20:24:22 +00002555PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002556 Py_ssize_t size,
2557 const char *errors,
2558 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002559{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002560 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002562 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002563 Py_ssize_t startinpos;
2564 Py_ssize_t endinpos;
2565 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002566 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002567 PyUnicodeObject *unicode;
2568 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002569 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 PyObject *errorHandler = NULL;
2571 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572
2573 /* Note: size will always be longer than the resulting Unicode
2574 character count */
2575 unicode = _PyUnicode_New(size);
2576 if (!unicode)
2577 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002578 if (size == 0) {
2579 if (consumed)
2580 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583
2584 /* Unpack UTF-8 encoded data */
2585 p = unicode->str;
2586 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002587 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588
2589 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002590 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591
2592 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002593 /* Fast path for runs of ASCII characters. Given that common UTF-8
2594 input will consist of an overwhelming majority of ASCII
2595 characters, we try to optimize for this case by checking
2596 as many characters as a C 'long' can contain.
2597 First, check if we can do an aligned read, as most CPUs have
2598 a penalty for unaligned reads.
2599 */
2600 if (!((size_t) s & LONG_PTR_MASK)) {
2601 /* Help register allocation */
2602 register const char *_s = s;
2603 register Py_UNICODE *_p = p;
2604 while (_s < aligned_end) {
2605 /* Read a whole long at a time (either 4 or 8 bytes),
2606 and do a fast unrolled copy if it only contains ASCII
2607 characters. */
2608 unsigned long data = *(unsigned long *) _s;
2609 if (data & ASCII_CHAR_MASK)
2610 break;
2611 _p[0] = (unsigned char) _s[0];
2612 _p[1] = (unsigned char) _s[1];
2613 _p[2] = (unsigned char) _s[2];
2614 _p[3] = (unsigned char) _s[3];
2615#if (SIZEOF_LONG == 8)
2616 _p[4] = (unsigned char) _s[4];
2617 _p[5] = (unsigned char) _s[5];
2618 _p[6] = (unsigned char) _s[6];
2619 _p[7] = (unsigned char) _s[7];
2620#endif
2621 _s += SIZEOF_LONG;
2622 _p += SIZEOF_LONG;
2623 }
2624 s = _s;
2625 p = _p;
2626 if (s == e)
2627 break;
2628 ch = (unsigned char)*s;
2629 }
2630 }
2631
2632 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002633 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634 s++;
2635 continue;
2636 }
2637
2638 n = utf8_code_length[ch];
2639
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002640 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002641 if (consumed)
2642 break;
2643 else {
2644 errmsg = "unexpected end of data";
2645 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002646 endinpos = startinpos+1;
2647 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2648 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002649 goto utf8Error;
2650 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652
2653 switch (n) {
2654
2655 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002656 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002657 startinpos = s-starts;
2658 endinpos = startinpos+1;
2659 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660
2661 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002662 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002663 startinpos = s-starts;
2664 endinpos = startinpos+1;
2665 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666
2667 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002668 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002669 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002670 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002671 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002672 goto utf8Error;
2673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002675 assert ((ch > 0x007F) && (ch <= 0x07FF));
2676 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 break;
2678
2679 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002680 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2681 will result in surrogates in range d800-dfff. Surrogates are
2682 not valid UTF-8 so they are rejected.
2683 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2684 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002685 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002686 (s[2] & 0xc0) != 0x80 ||
2687 ((unsigned char)s[0] == 0xE0 &&
2688 (unsigned char)s[1] < 0xA0) ||
2689 ((unsigned char)s[0] == 0xED &&
2690 (unsigned char)s[1] > 0x9F)) {
2691 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002693 endinpos = startinpos + 1;
2694
2695 /* if s[1] first two bits are 1 and 0, then the invalid
2696 continuation byte is s[2], so increment endinpos by 1,
2697 if not, s[1] is invalid and endinpos doesn't need to
2698 be incremented. */
2699 if ((s[1] & 0xC0) == 0x80)
2700 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002701 goto utf8Error;
2702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002704 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2705 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002706 break;
2707
2708 case 4:
2709 if ((s[1] & 0xc0) != 0x80 ||
2710 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002711 (s[3] & 0xc0) != 0x80 ||
2712 ((unsigned char)s[0] == 0xF0 &&
2713 (unsigned char)s[1] < 0x90) ||
2714 ((unsigned char)s[0] == 0xF4 &&
2715 (unsigned char)s[1] > 0x8F)) {
2716 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002717 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002718 endinpos = startinpos + 1;
2719 if ((s[1] & 0xC0) == 0x80) {
2720 endinpos++;
2721 if ((s[2] & 0xC0) == 0x80)
2722 endinpos++;
2723 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002724 goto utf8Error;
2725 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002726 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002727 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2728 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2729
Fredrik Lundh8f455852001-06-27 18:59:43 +00002730#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002731 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002732#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002733 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002734
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002735 /* translate from 10000..10FFFF to 0..FFFF */
2736 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002737
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002738 /* high surrogate = top 10 bits added to D800 */
2739 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002740
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002741 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002742 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002743#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 }
2746 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002747 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002748
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 utf8Error:
2750 outpos = p-PyUnicode_AS_UNICODE(unicode);
2751 if (unicode_decode_call_errorhandler(
2752 errors, &errorHandler,
2753 "utf8", errmsg,
2754 &starts, &e, &startinpos, &endinpos, &exc, &s,
2755 &unicode, &outpos, &p))
2756 goto onError;
2757 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 }
Walter Dörwald69652032004-09-07 20:24:22 +00002759 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761
2762 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002763 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 goto onError;
2765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002766 Py_XDECREF(errorHandler);
2767 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768 return (PyObject *)unicode;
2769
Benjamin Peterson29060642009-01-31 22:14:21 +00002770 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002771 Py_XDECREF(errorHandler);
2772 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 Py_DECREF(unicode);
2774 return NULL;
2775}
2776
Antoine Pitrouab868312009-01-10 15:40:25 +00002777#undef ASCII_CHAR_MASK
2778
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002779#ifdef __APPLE__
2780
2781/* Simplified UTF-8 decoder using surrogateescape error handler,
2782 used to decode the command line arguments on Mac OS X. */
2783
2784wchar_t*
2785_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2786{
2787 int n;
2788 const char *e;
2789 wchar_t *unicode, *p;
2790
2791 /* Note: size will always be longer than the resulting Unicode
2792 character count */
2793 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2794 PyErr_NoMemory();
2795 return NULL;
2796 }
2797 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2798 if (!unicode)
2799 return NULL;
2800
2801 /* Unpack UTF-8 encoded data */
2802 p = unicode;
2803 e = s + size;
2804 while (s < e) {
2805 Py_UCS4 ch = (unsigned char)*s;
2806
2807 if (ch < 0x80) {
2808 *p++ = (wchar_t)ch;
2809 s++;
2810 continue;
2811 }
2812
2813 n = utf8_code_length[ch];
2814 if (s + n > e) {
2815 goto surrogateescape;
2816 }
2817
2818 switch (n) {
2819 case 0:
2820 case 1:
2821 goto surrogateescape;
2822
2823 case 2:
2824 if ((s[1] & 0xc0) != 0x80)
2825 goto surrogateescape;
2826 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2827 assert ((ch > 0x007F) && (ch <= 0x07FF));
2828 *p++ = (wchar_t)ch;
2829 break;
2830
2831 case 3:
2832 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2833 will result in surrogates in range d800-dfff. Surrogates are
2834 not valid UTF-8 so they are rejected.
2835 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2836 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2837 if ((s[1] & 0xc0) != 0x80 ||
2838 (s[2] & 0xc0) != 0x80 ||
2839 ((unsigned char)s[0] == 0xE0 &&
2840 (unsigned char)s[1] < 0xA0) ||
2841 ((unsigned char)s[0] == 0xED &&
2842 (unsigned char)s[1] > 0x9F)) {
2843
2844 goto surrogateescape;
2845 }
2846 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2847 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2848 *p++ = (Py_UNICODE)ch;
2849 break;
2850
2851 case 4:
2852 if ((s[1] & 0xc0) != 0x80 ||
2853 (s[2] & 0xc0) != 0x80 ||
2854 (s[3] & 0xc0) != 0x80 ||
2855 ((unsigned char)s[0] == 0xF0 &&
2856 (unsigned char)s[1] < 0x90) ||
2857 ((unsigned char)s[0] == 0xF4 &&
2858 (unsigned char)s[1] > 0x8F)) {
2859 goto surrogateescape;
2860 }
2861 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2862 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2863 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2864
2865#if SIZEOF_WCHAR_T == 4
2866 *p++ = (wchar_t)ch;
2867#else
2868 /* compute and append the two surrogates: */
2869
2870 /* translate from 10000..10FFFF to 0..FFFF */
2871 ch -= 0x10000;
2872
2873 /* high surrogate = top 10 bits added to D800 */
2874 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2875
2876 /* low surrogate = bottom 10 bits added to DC00 */
2877 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2878#endif
2879 break;
2880 }
2881 s += n;
2882 continue;
2883
2884 surrogateescape:
2885 *p++ = 0xDC00 + ch;
2886 s++;
2887 }
2888 *p = L'\0';
2889 return unicode;
2890}
2891
2892#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002893
Tim Peters602f7402002-04-27 18:03:26 +00002894/* Allocation strategy: if the string is short, convert into a stack buffer
2895 and allocate exactly as much space needed at the end. Else allocate the
2896 maximum possible needed (4 result bytes per Unicode character), and return
2897 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002898*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002899PyObject *
2900PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 Py_ssize_t size,
2902 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903{
Tim Peters602f7402002-04-27 18:03:26 +00002904#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002905
Guido van Rossum98297ee2007-11-06 21:34:58 +00002906 Py_ssize_t i; /* index into s of next input byte */
2907 PyObject *result; /* result string object */
2908 char *p; /* next free byte in output buffer */
2909 Py_ssize_t nallocated; /* number of result bytes allocated */
2910 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002911 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002912 PyObject *errorHandler = NULL;
2913 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002914
Tim Peters602f7402002-04-27 18:03:26 +00002915 assert(s != NULL);
2916 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917
Tim Peters602f7402002-04-27 18:03:26 +00002918 if (size <= MAX_SHORT_UNICHARS) {
2919 /* Write into the stack buffer; nallocated can't overflow.
2920 * At the end, we'll allocate exactly as much heap space as it
2921 * turns out we need.
2922 */
2923 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002924 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002925 p = stackbuf;
2926 }
2927 else {
2928 /* Overallocate on the heap, and give the excess back at the end. */
2929 nallocated = size * 4;
2930 if (nallocated / 4 != size) /* overflow! */
2931 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002932 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002933 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002934 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002935 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002936 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002937
Tim Peters602f7402002-04-27 18:03:26 +00002938 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002939 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002940
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002941 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002942 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002944
Guido van Rossumd57fd912000-03-10 22:53:23 +00002945 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002946 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002947 *p++ = (char)(0xc0 | (ch >> 6));
2948 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002949 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002950#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002951 /* Special case: check for high and low surrogate */
2952 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2953 Py_UCS4 ch2 = s[i];
2954 /* Combine the two surrogates to form a UCS4 value */
2955 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2956 i++;
2957
2958 /* Encode UCS4 Unicode ordinals */
2959 *p++ = (char)(0xf0 | (ch >> 18));
2960 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002961 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2962 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002963 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002964#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002965 Py_ssize_t newpos;
2966 PyObject *rep;
2967 Py_ssize_t repsize, k;
2968 rep = unicode_encode_call_errorhandler
2969 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2970 s, size, &exc, i-1, i, &newpos);
2971 if (!rep)
2972 goto error;
2973
2974 if (PyBytes_Check(rep))
2975 repsize = PyBytes_GET_SIZE(rep);
2976 else
2977 repsize = PyUnicode_GET_SIZE(rep);
2978
2979 if (repsize > 4) {
2980 Py_ssize_t offset;
2981
2982 if (result == NULL)
2983 offset = p - stackbuf;
2984 else
2985 offset = p - PyBytes_AS_STRING(result);
2986
2987 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2988 /* integer overflow */
2989 PyErr_NoMemory();
2990 goto error;
2991 }
2992 nallocated += repsize - 4;
2993 if (result != NULL) {
2994 if (_PyBytes_Resize(&result, nallocated) < 0)
2995 goto error;
2996 } else {
2997 result = PyBytes_FromStringAndSize(NULL, nallocated);
2998 if (result == NULL)
2999 goto error;
3000 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3001 }
3002 p = PyBytes_AS_STRING(result) + offset;
3003 }
3004
3005 if (PyBytes_Check(rep)) {
3006 char *prep = PyBytes_AS_STRING(rep);
3007 for(k = repsize; k > 0; k--)
3008 *p++ = *prep++;
3009 } else /* rep is unicode */ {
3010 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3011 Py_UNICODE c;
3012
3013 for(k=0; k<repsize; k++) {
3014 c = prep[k];
3015 if (0x80 <= c) {
3016 raise_encode_exception(&exc, "utf-8", s, size,
3017 i-1, i, "surrogates not allowed");
3018 goto error;
3019 }
3020 *p++ = (char)prep[k];
3021 }
3022 }
3023 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003024#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003025 }
Victor Stinner445a6232010-04-22 20:01:57 +00003026#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003027 } else if (ch < 0x10000) {
3028 *p++ = (char)(0xe0 | (ch >> 12));
3029 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3030 *p++ = (char)(0x80 | (ch & 0x3f));
3031 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003032 /* Encode UCS4 Unicode ordinals */
3033 *p++ = (char)(0xf0 | (ch >> 18));
3034 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3035 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3036 *p++ = (char)(0x80 | (ch & 0x3f));
3037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003039
Guido van Rossum98297ee2007-11-06 21:34:58 +00003040 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003041 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003042 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003043 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003044 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003045 }
3046 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003047 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003048 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003049 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003050 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003051 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003052 Py_XDECREF(errorHandler);
3053 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003054 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003055 error:
3056 Py_XDECREF(errorHandler);
3057 Py_XDECREF(exc);
3058 Py_XDECREF(result);
3059 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003060
Tim Peters602f7402002-04-27 18:03:26 +00003061#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062}
3063
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3065{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 if (!PyUnicode_Check(unicode)) {
3067 PyErr_BadArgument();
3068 return NULL;
3069 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003070 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 PyUnicode_GET_SIZE(unicode),
3072 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073}
3074
Walter Dörwald41980ca2007-08-16 21:55:45 +00003075/* --- UTF-32 Codec ------------------------------------------------------- */
3076
3077PyObject *
3078PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003079 Py_ssize_t size,
3080 const char *errors,
3081 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003082{
3083 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3084}
3085
3086PyObject *
3087PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 Py_ssize_t size,
3089 const char *errors,
3090 int *byteorder,
3091 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003092{
3093 const char *starts = s;
3094 Py_ssize_t startinpos;
3095 Py_ssize_t endinpos;
3096 Py_ssize_t outpos;
3097 PyUnicodeObject *unicode;
3098 Py_UNICODE *p;
3099#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003100 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003101 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003102#else
3103 const int pairs = 0;
3104#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003105 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003106 int bo = 0; /* assume native ordering by default */
3107 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003108 /* Offsets from q for retrieving bytes in the right order. */
3109#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3110 int iorder[] = {0, 1, 2, 3};
3111#else
3112 int iorder[] = {3, 2, 1, 0};
3113#endif
3114 PyObject *errorHandler = NULL;
3115 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003116
Walter Dörwald41980ca2007-08-16 21:55:45 +00003117 q = (unsigned char *)s;
3118 e = q + size;
3119
3120 if (byteorder)
3121 bo = *byteorder;
3122
3123 /* Check for BOM marks (U+FEFF) in the input and adjust current
3124 byte order setting accordingly. In native mode, the leading BOM
3125 mark is skipped, in all other modes, it is copied to the output
3126 stream as-is (giving a ZWNBSP character). */
3127 if (bo == 0) {
3128 if (size >= 4) {
3129 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003130 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003131#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003132 if (bom == 0x0000FEFF) {
3133 q += 4;
3134 bo = -1;
3135 }
3136 else if (bom == 0xFFFE0000) {
3137 q += 4;
3138 bo = 1;
3139 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003140#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003141 if (bom == 0x0000FEFF) {
3142 q += 4;
3143 bo = 1;
3144 }
3145 else if (bom == 0xFFFE0000) {
3146 q += 4;
3147 bo = -1;
3148 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003149#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003150 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003151 }
3152
3153 if (bo == -1) {
3154 /* force LE */
3155 iorder[0] = 0;
3156 iorder[1] = 1;
3157 iorder[2] = 2;
3158 iorder[3] = 3;
3159 }
3160 else if (bo == 1) {
3161 /* force BE */
3162 iorder[0] = 3;
3163 iorder[1] = 2;
3164 iorder[2] = 1;
3165 iorder[3] = 0;
3166 }
3167
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003168 /* On narrow builds we split characters outside the BMP into two
3169 codepoints => count how much extra space we need. */
3170#ifndef Py_UNICODE_WIDE
3171 for (qq = q; qq < e; qq += 4)
3172 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3173 pairs++;
3174#endif
3175
3176 /* This might be one to much, because of a BOM */
3177 unicode = _PyUnicode_New((size+3)/4+pairs);
3178 if (!unicode)
3179 return NULL;
3180 if (size == 0)
3181 return (PyObject *)unicode;
3182
3183 /* Unpack UTF-32 encoded data */
3184 p = unicode->str;
3185
Walter Dörwald41980ca2007-08-16 21:55:45 +00003186 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 Py_UCS4 ch;
3188 /* remaining bytes at the end? (size should be divisible by 4) */
3189 if (e-q<4) {
3190 if (consumed)
3191 break;
3192 errmsg = "truncated data";
3193 startinpos = ((const char *)q)-starts;
3194 endinpos = ((const char *)e)-starts;
3195 goto utf32Error;
3196 /* The remaining input chars are ignored if the callback
3197 chooses to skip the input */
3198 }
3199 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3200 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003201
Benjamin Peterson29060642009-01-31 22:14:21 +00003202 if (ch >= 0x110000)
3203 {
3204 errmsg = "codepoint not in range(0x110000)";
3205 startinpos = ((const char *)q)-starts;
3206 endinpos = startinpos+4;
3207 goto utf32Error;
3208 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003209#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003210 if (ch >= 0x10000)
3211 {
3212 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3213 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3214 }
3215 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003216#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 *p++ = ch;
3218 q += 4;
3219 continue;
3220 utf32Error:
3221 outpos = p-PyUnicode_AS_UNICODE(unicode);
3222 if (unicode_decode_call_errorhandler(
3223 errors, &errorHandler,
3224 "utf32", errmsg,
3225 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3226 &unicode, &outpos, &p))
3227 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003228 }
3229
3230 if (byteorder)
3231 *byteorder = bo;
3232
3233 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003235
3236 /* Adjust length */
3237 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3238 goto onError;
3239
3240 Py_XDECREF(errorHandler);
3241 Py_XDECREF(exc);
3242 return (PyObject *)unicode;
3243
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003245 Py_DECREF(unicode);
3246 Py_XDECREF(errorHandler);
3247 Py_XDECREF(exc);
3248 return NULL;
3249}
3250
3251PyObject *
3252PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003253 Py_ssize_t size,
3254 const char *errors,
3255 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003256{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003257 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003258 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003259 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003260#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003261 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003262#else
3263 const int pairs = 0;
3264#endif
3265 /* Offsets from p for storing byte pairs in the right order. */
3266#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3267 int iorder[] = {0, 1, 2, 3};
3268#else
3269 int iorder[] = {3, 2, 1, 0};
3270#endif
3271
Benjamin Peterson29060642009-01-31 22:14:21 +00003272#define STORECHAR(CH) \
3273 do { \
3274 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3275 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3276 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3277 p[iorder[0]] = (CH) & 0xff; \
3278 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003279 } while(0)
3280
3281 /* In narrow builds we can output surrogate pairs as one codepoint,
3282 so we need less space. */
3283#ifndef Py_UNICODE_WIDE
3284 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3286 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3287 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003288#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003289 nsize = (size - pairs + (byteorder == 0));
3290 bytesize = nsize * 4;
3291 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003292 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003293 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003294 if (v == NULL)
3295 return NULL;
3296
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003297 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003298 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003299 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003300 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003301 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003302
3303 if (byteorder == -1) {
3304 /* force LE */
3305 iorder[0] = 0;
3306 iorder[1] = 1;
3307 iorder[2] = 2;
3308 iorder[3] = 3;
3309 }
3310 else if (byteorder == 1) {
3311 /* force BE */
3312 iorder[0] = 3;
3313 iorder[1] = 2;
3314 iorder[2] = 1;
3315 iorder[3] = 0;
3316 }
3317
3318 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003319 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003321 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3322 Py_UCS4 ch2 = *s;
3323 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3324 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3325 s++;
3326 size--;
3327 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003328 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003329#endif
3330 STORECHAR(ch);
3331 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003332
3333 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003334 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003335#undef STORECHAR
3336}
3337
3338PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3339{
3340 if (!PyUnicode_Check(unicode)) {
3341 PyErr_BadArgument();
3342 return NULL;
3343 }
3344 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 PyUnicode_GET_SIZE(unicode),
3346 NULL,
3347 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003348}
3349
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350/* --- UTF-16 Codec ------------------------------------------------------- */
3351
Tim Peters772747b2001-08-09 22:21:55 +00003352PyObject *
3353PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003354 Py_ssize_t size,
3355 const char *errors,
3356 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357{
Walter Dörwald69652032004-09-07 20:24:22 +00003358 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3359}
3360
Antoine Pitrouab868312009-01-10 15:40:25 +00003361/* Two masks for fast checking of whether a C 'long' may contain
3362 UTF16-encoded surrogate characters. This is an efficient heuristic,
3363 assuming that non-surrogate characters with a code point >= 0x8000 are
3364 rare in most input.
3365 FAST_CHAR_MASK is used when the input is in native byte ordering,
3366 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003367*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003368#if (SIZEOF_LONG == 8)
3369# define FAST_CHAR_MASK 0x8000800080008000L
3370# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3371#elif (SIZEOF_LONG == 4)
3372# define FAST_CHAR_MASK 0x80008000L
3373# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3374#else
3375# error C 'long' size should be either 4 or 8!
3376#endif
3377
Walter Dörwald69652032004-09-07 20:24:22 +00003378PyObject *
3379PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003380 Py_ssize_t size,
3381 const char *errors,
3382 int *byteorder,
3383 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003384{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003385 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003386 Py_ssize_t startinpos;
3387 Py_ssize_t endinpos;
3388 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 PyUnicodeObject *unicode;
3390 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003391 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003392 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003393 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003394 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003395 /* Offsets from q for retrieving byte pairs in the right order. */
3396#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3397 int ihi = 1, ilo = 0;
3398#else
3399 int ihi = 0, ilo = 1;
3400#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401 PyObject *errorHandler = NULL;
3402 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003403
3404 /* Note: size will always be longer than the resulting Unicode
3405 character count */
3406 unicode = _PyUnicode_New(size);
3407 if (!unicode)
3408 return NULL;
3409 if (size == 0)
3410 return (PyObject *)unicode;
3411
3412 /* Unpack UTF-16 encoded data */
3413 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003414 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003415 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416
3417 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003418 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003420 /* Check for BOM marks (U+FEFF) in the input and adjust current
3421 byte order setting accordingly. In native mode, the leading BOM
3422 mark is skipped, in all other modes, it is copied to the output
3423 stream as-is (giving a ZWNBSP character). */
3424 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003425 if (size >= 2) {
3426 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003427#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003428 if (bom == 0xFEFF) {
3429 q += 2;
3430 bo = -1;
3431 }
3432 else if (bom == 0xFFFE) {
3433 q += 2;
3434 bo = 1;
3435 }
Tim Petersced69f82003-09-16 20:30:58 +00003436#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003437 if (bom == 0xFEFF) {
3438 q += 2;
3439 bo = 1;
3440 }
3441 else if (bom == 0xFFFE) {
3442 q += 2;
3443 bo = -1;
3444 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003445#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448
Tim Peters772747b2001-08-09 22:21:55 +00003449 if (bo == -1) {
3450 /* force LE */
3451 ihi = 1;
3452 ilo = 0;
3453 }
3454 else if (bo == 1) {
3455 /* force BE */
3456 ihi = 0;
3457 ilo = 1;
3458 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003459#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3460 native_ordering = ilo < ihi;
3461#else
3462 native_ordering = ilo > ihi;
3463#endif
Tim Peters772747b2001-08-09 22:21:55 +00003464
Antoine Pitrouab868312009-01-10 15:40:25 +00003465 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003466 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003468 /* First check for possible aligned read of a C 'long'. Unaligned
3469 reads are more expensive, better to defer to another iteration. */
3470 if (!((size_t) q & LONG_PTR_MASK)) {
3471 /* Fast path for runs of non-surrogate chars. */
3472 register const unsigned char *_q = q;
3473 Py_UNICODE *_p = p;
3474 if (native_ordering) {
3475 /* Native ordering is simple: as long as the input cannot
3476 possibly contain a surrogate char, do an unrolled copy
3477 of several 16-bit code points to the target object.
3478 The non-surrogate check is done on several input bytes
3479 at a time (as many as a C 'long' can contain). */
3480 while (_q < aligned_end) {
3481 unsigned long data = * (unsigned long *) _q;
3482 if (data & FAST_CHAR_MASK)
3483 break;
3484 _p[0] = ((unsigned short *) _q)[0];
3485 _p[1] = ((unsigned short *) _q)[1];
3486#if (SIZEOF_LONG == 8)
3487 _p[2] = ((unsigned short *) _q)[2];
3488 _p[3] = ((unsigned short *) _q)[3];
3489#endif
3490 _q += SIZEOF_LONG;
3491 _p += SIZEOF_LONG / 2;
3492 }
3493 }
3494 else {
3495 /* Byteswapped ordering is similar, but we must decompose
3496 the copy bytewise, and take care of zero'ing out the
3497 upper bytes if the target object is in 32-bit units
3498 (that is, in UCS-4 builds). */
3499 while (_q < aligned_end) {
3500 unsigned long data = * (unsigned long *) _q;
3501 if (data & SWAPPED_FAST_CHAR_MASK)
3502 break;
3503 /* Zero upper bytes in UCS-4 builds */
3504#if (Py_UNICODE_SIZE > 2)
3505 _p[0] = 0;
3506 _p[1] = 0;
3507#if (SIZEOF_LONG == 8)
3508 _p[2] = 0;
3509 _p[3] = 0;
3510#endif
3511#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003512 /* Issue #4916; UCS-4 builds on big endian machines must
3513 fill the two last bytes of each 4-byte unit. */
3514#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3515# define OFF 2
3516#else
3517# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003518#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003519 ((unsigned char *) _p)[OFF + 1] = _q[0];
3520 ((unsigned char *) _p)[OFF + 0] = _q[1];
3521 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3522 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3523#if (SIZEOF_LONG == 8)
3524 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3525 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3526 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3527 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3528#endif
3529#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003530 _q += SIZEOF_LONG;
3531 _p += SIZEOF_LONG / 2;
3532 }
3533 }
3534 p = _p;
3535 q = _q;
3536 if (q >= e)
3537 break;
3538 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003540
Benjamin Peterson14339b62009-01-31 16:36:08 +00003541 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003542
3543 if (ch < 0xD800 || ch > 0xDFFF) {
3544 *p++ = ch;
3545 continue;
3546 }
3547
3548 /* UTF-16 code pair: */
3549 if (q > e) {
3550 errmsg = "unexpected end of data";
3551 startinpos = (((const char *)q) - 2) - starts;
3552 endinpos = ((const char *)e) + 1 - starts;
3553 goto utf16Error;
3554 }
3555 if (0xD800 <= ch && ch <= 0xDBFF) {
3556 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3557 q += 2;
3558 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003559#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003560 *p++ = ch;
3561 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003562#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003563 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003564#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003565 continue;
3566 }
3567 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003568 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 startinpos = (((const char *)q)-4)-starts;
3570 endinpos = startinpos+2;
3571 goto utf16Error;
3572 }
3573
Benjamin Peterson14339b62009-01-31 16:36:08 +00003574 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003575 errmsg = "illegal encoding";
3576 startinpos = (((const char *)q)-2)-starts;
3577 endinpos = startinpos+2;
3578 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003579
Benjamin Peterson29060642009-01-31 22:14:21 +00003580 utf16Error:
3581 outpos = p - PyUnicode_AS_UNICODE(unicode);
3582 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003583 errors,
3584 &errorHandler,
3585 "utf16", errmsg,
3586 &starts,
3587 (const char **)&e,
3588 &startinpos,
3589 &endinpos,
3590 &exc,
3591 (const char **)&q,
3592 &unicode,
3593 &outpos,
3594 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003595 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003597 /* remaining byte at the end? (size should be even) */
3598 if (e == q) {
3599 if (!consumed) {
3600 errmsg = "truncated data";
3601 startinpos = ((const char *)q) - starts;
3602 endinpos = ((const char *)e) + 1 - starts;
3603 outpos = p - PyUnicode_AS_UNICODE(unicode);
3604 if (unicode_decode_call_errorhandler(
3605 errors,
3606 &errorHandler,
3607 "utf16", errmsg,
3608 &starts,
3609 (const char **)&e,
3610 &startinpos,
3611 &endinpos,
3612 &exc,
3613 (const char **)&q,
3614 &unicode,
3615 &outpos,
3616 &p))
3617 goto onError;
3618 /* The remaining input chars are ignored if the callback
3619 chooses to skip the input */
3620 }
3621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003622
3623 if (byteorder)
3624 *byteorder = bo;
3625
Walter Dörwald69652032004-09-07 20:24:22 +00003626 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003628
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003630 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003631 goto onError;
3632
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 Py_XDECREF(errorHandler);
3634 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 return (PyObject *)unicode;
3636
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003639 Py_XDECREF(errorHandler);
3640 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 return NULL;
3642}
3643
Antoine Pitrouab868312009-01-10 15:40:25 +00003644#undef FAST_CHAR_MASK
3645#undef SWAPPED_FAST_CHAR_MASK
3646
Tim Peters772747b2001-08-09 22:21:55 +00003647PyObject *
3648PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 Py_ssize_t size,
3650 const char *errors,
3651 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003653 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003654 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003655 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003656#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003657 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003658#else
3659 const int pairs = 0;
3660#endif
Tim Peters772747b2001-08-09 22:21:55 +00003661 /* Offsets from p for storing byte pairs in the right order. */
3662#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3663 int ihi = 1, ilo = 0;
3664#else
3665 int ihi = 0, ilo = 1;
3666#endif
3667
Benjamin Peterson29060642009-01-31 22:14:21 +00003668#define STORECHAR(CH) \
3669 do { \
3670 p[ihi] = ((CH) >> 8) & 0xff; \
3671 p[ilo] = (CH) & 0xff; \
3672 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003673 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003675#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003676 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 if (s[i] >= 0x10000)
3678 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003679#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003680 /* 2 * (size + pairs + (byteorder == 0)) */
3681 if (size > PY_SSIZE_T_MAX ||
3682 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003684 nsize = size + pairs + (byteorder == 0);
3685 bytesize = nsize * 2;
3686 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003688 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 if (v == NULL)
3690 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003691
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003692 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003695 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003696 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003697
3698 if (byteorder == -1) {
3699 /* force LE */
3700 ihi = 1;
3701 ilo = 0;
3702 }
3703 else if (byteorder == 1) {
3704 /* force BE */
3705 ihi = 0;
3706 ilo = 1;
3707 }
3708
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003709 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 Py_UNICODE ch = *s++;
3711 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003712#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003713 if (ch >= 0x10000) {
3714 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3715 ch = 0xD800 | ((ch-0x10000) >> 10);
3716 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003717#endif
Tim Peters772747b2001-08-09 22:21:55 +00003718 STORECHAR(ch);
3719 if (ch2)
3720 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003721 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003722
3723 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003724 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003725#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003726}
3727
3728PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3729{
3730 if (!PyUnicode_Check(unicode)) {
3731 PyErr_BadArgument();
3732 return NULL;
3733 }
3734 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003735 PyUnicode_GET_SIZE(unicode),
3736 NULL,
3737 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738}
3739
3740/* --- Unicode Escape Codec ----------------------------------------------- */
3741
/* File-local pointer to a _PyUnicode_Name_CAPI structure; it starts out
   as NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   that follows, for character-name lookups -- confirm against the code
   that assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00003742static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003743
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003745 Py_ssize_t size,
3746 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003749 Py_ssize_t startinpos;
3750 Py_ssize_t endinpos;
3751 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003756 char* message;
3757 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 PyObject *errorHandler = NULL;
3759 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003760
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761 /* Escaped strings will always be longer than the resulting
3762 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 length after conversion to the true value.
3764 (but if the error callback returns a long replacement string
3765 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766 v = _PyUnicode_New(size);
3767 if (v == NULL)
3768 goto onError;
3769 if (size == 0)
3770 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003771
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003774
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 while (s < end) {
3776 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003777 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779
3780 /* Non-escape characters are interpreted as Unicode ordinals */
3781 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003782 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 continue;
3784 }
3785
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003786 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787 /* \ - Escapes */
3788 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003789 c = *s++;
3790 if (s > end)
3791 c = '\0'; /* Invalid after \ */
3792 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793
Benjamin Peterson29060642009-01-31 22:14:21 +00003794 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 case '\n': break;
3796 case '\\': *p++ = '\\'; break;
3797 case '\'': *p++ = '\''; break;
3798 case '\"': *p++ = '\"'; break;
3799 case 'b': *p++ = '\b'; break;
3800 case 'f': *p++ = '\014'; break; /* FF */
3801 case 't': *p++ = '\t'; break;
3802 case 'n': *p++ = '\n'; break;
3803 case 'r': *p++ = '\r'; break;
3804 case 'v': *p++ = '\013'; break; /* VT */
3805 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3806
Benjamin Peterson29060642009-01-31 22:14:21 +00003807 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808 case '0': case '1': case '2': case '3':
3809 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003810 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003811 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003812 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003813 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003814 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003816 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 break;
3818
Benjamin Peterson29060642009-01-31 22:14:21 +00003819 /* hex escapes */
3820 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003821 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003822 digits = 2;
3823 message = "truncated \\xXX escape";
3824 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825
Benjamin Peterson29060642009-01-31 22:14:21 +00003826 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003828 digits = 4;
3829 message = "truncated \\uXXXX escape";
3830 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831
Benjamin Peterson29060642009-01-31 22:14:21 +00003832 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003833 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003834 digits = 8;
3835 message = "truncated \\UXXXXXXXX escape";
3836 hexescape:
3837 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003838 outpos = p-PyUnicode_AS_UNICODE(v);
3839 if (s+digits>end) {
3840 endinpos = size;
3841 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003842 errors, &errorHandler,
3843 "unicodeescape", "end of string in escape sequence",
3844 &starts, &end, &startinpos, &endinpos, &exc, &s,
3845 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003846 goto onError;
3847 goto nextByte;
3848 }
3849 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003850 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003851 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852 endinpos = (s+i+1)-starts;
3853 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003854 errors, &errorHandler,
3855 "unicodeescape", message,
3856 &starts, &end, &startinpos, &endinpos, &exc, &s,
3857 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003858 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003859 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003860 }
3861 chr = (chr<<4) & ~0xF;
3862 if (c >= '0' && c <= '9')
3863 chr += c - '0';
3864 else if (c >= 'a' && c <= 'f')
3865 chr += 10 + c - 'a';
3866 else
3867 chr += 10 + c - 'A';
3868 }
3869 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003870 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871 /* _decoding_error will have already written into the
3872 target buffer. */
3873 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003874 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003875 /* when we get here, chr is a 32-bit unicode character */
3876 if (chr <= 0xffff)
3877 /* UCS-2 character */
3878 *p++ = (Py_UNICODE) chr;
3879 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003880 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003881 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003882#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003883 *p++ = chr;
3884#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003885 chr -= 0x10000L;
3886 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003887 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003888#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003889 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 endinpos = s-starts;
3891 outpos = p-PyUnicode_AS_UNICODE(v);
3892 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003893 errors, &errorHandler,
3894 "unicodeescape", "illegal Unicode character",
3895 &starts, &end, &startinpos, &endinpos, &exc, &s,
3896 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003897 goto onError;
3898 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003899 break;
3900
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003902 case 'N':
3903 message = "malformed \\N character escape";
3904 if (ucnhash_CAPI == NULL) {
3905 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003906 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003907 if (ucnhash_CAPI == NULL)
3908 goto ucnhashError;
3909 }
3910 if (*s == '{') {
3911 const char *start = s+1;
3912 /* look for the closing brace */
3913 while (*s != '}' && s < end)
3914 s++;
3915 if (s > start && s < end && *s == '}') {
3916 /* found a name. look it up in the unicode database */
3917 message = "unknown Unicode character name";
3918 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003919 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003920 goto store;
3921 }
3922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003923 endinpos = s-starts;
3924 outpos = p-PyUnicode_AS_UNICODE(v);
3925 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003926 errors, &errorHandler,
3927 "unicodeescape", message,
3928 &starts, &end, &startinpos, &endinpos, &exc, &s,
3929 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003930 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003931 break;
3932
3933 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003934 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 message = "\\ at end of string";
3936 s--;
3937 endinpos = s-starts;
3938 outpos = p-PyUnicode_AS_UNICODE(v);
3939 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003940 errors, &errorHandler,
3941 "unicodeescape", message,
3942 &starts, &end, &startinpos, &endinpos, &exc, &s,
3943 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003944 goto onError;
3945 }
3946 else {
3947 *p++ = '\\';
3948 *p++ = (unsigned char)s[-1];
3949 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003950 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003952 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003955 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003957 Py_XDECREF(errorHandler);
3958 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003960
Benjamin Peterson29060642009-01-31 22:14:21 +00003961 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003962 PyErr_SetString(
3963 PyExc_UnicodeError,
3964 "\\N escapes not supported (can't load unicodedata module)"
3965 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003966 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 Py_XDECREF(errorHandler);
3968 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003969 return NULL;
3970
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 Py_XDECREF(errorHandler);
3974 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 return NULL;
3976}
3977
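/* Return a Unicode-Escape string version of the Unicode object.

   NOTE(review): this comment used to continue with "If quotes is true,
   the string is enclosed in u"" or u'' quotes as appropriate", but the
   encoder functions that follow take no `quotes` argument, so that
   sentence looks stale -- confirm before relying on it.

*/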
3984
Thomas Wouters477c8d52006-05-27 19:21:47 +00003985Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003986 Py_ssize_t size,
3987 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003988{
3989 /* like wcschr, but doesn't stop at NULL characters */
3990
3991 while (size-- > 0) {
3992 if (*s == ch)
3993 return s;
3994 s++;
3995 }
3996
3997 return NULL;
3998}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003999
Walter Dörwald79e913e2007-05-12 11:08:06 +00004000static const char *hexdigits = "0123456789abcdef";
4001
4002PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004005 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004008#ifdef Py_UNICODE_WIDE
4009 const Py_ssize_t expandsize = 10;
4010#else
4011 const Py_ssize_t expandsize = 6;
4012#endif
4013
Thomas Wouters89f507f2006-12-13 04:49:30 +00004014 /* XXX(nnorwitz): rather than over-allocating, it would be
4015 better to choose a different scheme. Perhaps scan the
4016 first N-chars of the string and allocate based on that size.
4017 */
4018 /* Initial allocation is based on the longest-possible unichr
4019 escape.
4020
4021 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4022 unichr, so in this case it's the longest unichr escape. In
4023 narrow (UTF-16) builds this is five chars per source unichr
4024 since there are two unichrs in the surrogate pair, so in narrow
4025 (UTF-16) builds it's not the longest unichr escape.
4026
4027 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4028 so in the narrow (UTF-16) build case it's the longest unichr
4029 escape.
4030 */
4031
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004032 if (size == 0)
4033 return PyBytes_FromStringAndSize(NULL, 0);
4034
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004035 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004036 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004037
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004038 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 2
4040 + expandsize*size
4041 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 if (repr == NULL)
4043 return NULL;
4044
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004045 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046
Guido van Rossumd57fd912000-03-10 22:53:23 +00004047 while (size-- > 0) {
4048 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004049
Walter Dörwald79e913e2007-05-12 11:08:06 +00004050 /* Escape backslashes */
4051 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 *p++ = '\\';
4053 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004054 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004055 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004056
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004057#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004058 /* Map 21-bit characters to '\U00xxxxxx' */
4059 else if (ch >= 0x10000) {
4060 *p++ = '\\';
4061 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004062 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4063 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4064 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4065 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4066 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4067 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4068 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4069 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004071 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004072#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004073 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4074 else if (ch >= 0xD800 && ch < 0xDC00) {
4075 Py_UNICODE ch2;
4076 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004077
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 ch2 = *s++;
4079 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004080 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4082 *p++ = '\\';
4083 *p++ = 'U';
4084 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4085 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4086 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4087 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4088 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4089 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4090 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4091 *p++ = hexdigits[ucs & 0x0000000F];
4092 continue;
4093 }
4094 /* Fall through: isolated surrogates are copied as-is */
4095 s--;
4096 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004097 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004098#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004099
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004101 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004102 *p++ = '\\';
4103 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004104 *p++ = hexdigits[(ch >> 12) & 0x000F];
4105 *p++ = hexdigits[(ch >> 8) & 0x000F];
4106 *p++ = hexdigits[(ch >> 4) & 0x000F];
4107 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004109
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004110 /* Map special whitespace to '\t', \n', '\r' */
4111 else if (ch == '\t') {
4112 *p++ = '\\';
4113 *p++ = 't';
4114 }
4115 else if (ch == '\n') {
4116 *p++ = '\\';
4117 *p++ = 'n';
4118 }
4119 else if (ch == '\r') {
4120 *p++ = '\\';
4121 *p++ = 'r';
4122 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004123
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004124 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004125 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004127 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004128 *p++ = hexdigits[(ch >> 4) & 0x000F];
4129 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004130 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004131
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 /* Copy everything else as-is */
4133 else
4134 *p++ = (char) ch;
4135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004137 assert(p - PyBytes_AS_STRING(repr) > 0);
4138 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4139 return NULL;
4140 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141}
4142
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004143PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004145 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 if (!PyUnicode_Check(unicode)) {
4147 PyErr_BadArgument();
4148 return NULL;
4149 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004150 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4151 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004152 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153}
4154
4155/* --- Raw Unicode Escape Codec ------------------------------------------- */
4156
4157PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 Py_ssize_t size,
4159 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004162 Py_ssize_t startinpos;
4163 Py_ssize_t endinpos;
4164 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 const char *end;
4168 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004169 PyObject *errorHandler = NULL;
4170 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004171
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 /* Escaped strings will always be longer than the resulting
4173 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 length after conversion to the true value. (But decoding error
4175 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 v = _PyUnicode_New(size);
4177 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182 end = s + size;
4183 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004184 unsigned char c;
4185 Py_UCS4 x;
4186 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004187 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 /* Non-escape characters are interpreted as Unicode ordinals */
4190 if (*s != '\\') {
4191 *p++ = (unsigned char)*s++;
4192 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004193 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 startinpos = s-starts;
4195
4196 /* \u-escapes are only interpreted iff the number of leading
4197 backslashes if odd */
4198 bs = s;
4199 for (;s < end;) {
4200 if (*s != '\\')
4201 break;
4202 *p++ = (unsigned char)*s++;
4203 }
4204 if (((s - bs) & 1) == 0 ||
4205 s >= end ||
4206 (*s != 'u' && *s != 'U')) {
4207 continue;
4208 }
4209 p--;
4210 count = *s=='u' ? 4 : 8;
4211 s++;
4212
4213 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4214 outpos = p-PyUnicode_AS_UNICODE(v);
4215 for (x = 0, i = 0; i < count; ++i, ++s) {
4216 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004217 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004218 endinpos = s-starts;
4219 if (unicode_decode_call_errorhandler(
4220 errors, &errorHandler,
4221 "rawunicodeescape", "truncated \\uXXXX",
4222 &starts, &end, &startinpos, &endinpos, &exc, &s,
4223 &v, &outpos, &p))
4224 goto onError;
4225 goto nextByte;
4226 }
4227 x = (x<<4) & ~0xF;
4228 if (c >= '0' && c <= '9')
4229 x += c - '0';
4230 else if (c >= 'a' && c <= 'f')
4231 x += 10 + c - 'a';
4232 else
4233 x += 10 + c - 'A';
4234 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004235 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 /* UCS-2 character */
4237 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004238 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004239 /* UCS-4 character. Either store directly, or as
4240 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004241#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004242 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004243#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 x -= 0x10000L;
4245 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4246 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004247#endif
4248 } else {
4249 endinpos = s-starts;
4250 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004251 if (unicode_decode_call_errorhandler(
4252 errors, &errorHandler,
4253 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004254 &starts, &end, &startinpos, &endinpos, &exc, &s,
4255 &v, &outpos, &p))
4256 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004257 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004258 nextByte:
4259 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004261 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 Py_XDECREF(errorHandler);
4264 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004266
Benjamin Peterson29060642009-01-31 22:14:21 +00004267 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 Py_XDECREF(errorHandler);
4270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 return NULL;
4272}
4273
4274PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004275 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004277 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 char *p;
4279 char *q;
4280
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004281#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004282 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004283#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004284 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004285#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004286
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004287 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004289
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004290 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 if (repr == NULL)
4292 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004293 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004294 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004296 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 while (size-- > 0) {
4298 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004299#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004300 /* Map 32-bit characters to '\Uxxxxxxxx' */
4301 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004302 *p++ = '\\';
4303 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004304 *p++ = hexdigits[(ch >> 28) & 0xf];
4305 *p++ = hexdigits[(ch >> 24) & 0xf];
4306 *p++ = hexdigits[(ch >> 20) & 0xf];
4307 *p++ = hexdigits[(ch >> 16) & 0xf];
4308 *p++ = hexdigits[(ch >> 12) & 0xf];
4309 *p++ = hexdigits[(ch >> 8) & 0xf];
4310 *p++ = hexdigits[(ch >> 4) & 0xf];
4311 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004312 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004313 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004314#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4316 if (ch >= 0xD800 && ch < 0xDC00) {
4317 Py_UNICODE ch2;
4318 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004319
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 ch2 = *s++;
4321 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004322 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4324 *p++ = '\\';
4325 *p++ = 'U';
4326 *p++ = hexdigits[(ucs >> 28) & 0xf];
4327 *p++ = hexdigits[(ucs >> 24) & 0xf];
4328 *p++ = hexdigits[(ucs >> 20) & 0xf];
4329 *p++ = hexdigits[(ucs >> 16) & 0xf];
4330 *p++ = hexdigits[(ucs >> 12) & 0xf];
4331 *p++ = hexdigits[(ucs >> 8) & 0xf];
4332 *p++ = hexdigits[(ucs >> 4) & 0xf];
4333 *p++ = hexdigits[ucs & 0xf];
4334 continue;
4335 }
4336 /* Fall through: isolated surrogates are copied as-is */
4337 s--;
4338 size++;
4339 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004340#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004341 /* Map 16-bit characters to '\uxxxx' */
4342 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 *p++ = '\\';
4344 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004345 *p++ = hexdigits[(ch >> 12) & 0xf];
4346 *p++ = hexdigits[(ch >> 8) & 0xf];
4347 *p++ = hexdigits[(ch >> 4) & 0xf];
4348 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004350 /* Copy everything else as-is */
4351 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004352 *p++ = (char) ch;
4353 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004354 size = p - q;
4355
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004356 assert(size > 0);
4357 if (_PyBytes_Resize(&repr, size) < 0)
4358 return NULL;
4359 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360}
4361
4362PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4363{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004364 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004366 PyErr_BadArgument();
4367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004369 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4370 PyUnicode_GET_SIZE(unicode));
4371
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004372 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373}
4374
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004375/* --- Unicode Internal Codec ------------------------------------------- */
4376
4377PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 Py_ssize_t size,
4379 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004380{
4381 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004382 Py_ssize_t startinpos;
4383 Py_ssize_t endinpos;
4384 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004385 PyUnicodeObject *v;
4386 Py_UNICODE *p;
4387 const char *end;
4388 const char *reason;
4389 PyObject *errorHandler = NULL;
4390 PyObject *exc = NULL;
4391
Neal Norwitzd43069c2006-01-08 01:12:10 +00004392#ifdef Py_UNICODE_WIDE
4393 Py_UNICODE unimax = PyUnicode_GetMax();
4394#endif
4395
Thomas Wouters89f507f2006-12-13 04:49:30 +00004396 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004397 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4398 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004400 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004402 p = PyUnicode_AS_UNICODE(v);
4403 end = s + size;
4404
4405 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004406 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004407 /* We have to sanity check the raw data, otherwise doom looms for
4408 some malformed UCS-4 data. */
4409 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004410#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004411 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004412#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004413 end-s < Py_UNICODE_SIZE
4414 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004415 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004416 startinpos = s - starts;
4417 if (end-s < Py_UNICODE_SIZE) {
4418 endinpos = end-starts;
4419 reason = "truncated input";
4420 }
4421 else {
4422 endinpos = s - starts + Py_UNICODE_SIZE;
4423 reason = "illegal code point (> 0x10FFFF)";
4424 }
4425 outpos = p - PyUnicode_AS_UNICODE(v);
4426 if (unicode_decode_call_errorhandler(
4427 errors, &errorHandler,
4428 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004429 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004430 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004431 goto onError;
4432 }
4433 }
4434 else {
4435 p++;
4436 s += Py_UNICODE_SIZE;
4437 }
4438 }
4439
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004440 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004441 goto onError;
4442 Py_XDECREF(errorHandler);
4443 Py_XDECREF(exc);
4444 return (PyObject *)v;
4445
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004447 Py_XDECREF(v);
4448 Py_XDECREF(errorHandler);
4449 Py_XDECREF(exc);
4450 return NULL;
4451}
4452
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453/* --- Latin-1 Codec ------------------------------------------------------ */
4454
4455PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004456 Py_ssize_t size,
4457 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458{
4459 PyUnicodeObject *v;
4460 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004461 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004462
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004464 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 Py_UNICODE r = *(unsigned char*)s;
4466 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004467 }
4468
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 v = _PyUnicode_New(size);
4470 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004471 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004475 e = s + size;
4476 /* Unrolling the copy makes it much faster by reducing the looping
4477 overhead. This is similar to what many memcpy() implementations do. */
4478 unrolled_end = e - 4;
4479 while (s < unrolled_end) {
4480 p[0] = (unsigned char) s[0];
4481 p[1] = (unsigned char) s[1];
4482 p[2] = (unsigned char) s[2];
4483 p[3] = (unsigned char) s[3];
4484 s += 4;
4485 p += 4;
4486 }
4487 while (s < e)
4488 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004490
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 Py_XDECREF(v);
4493 return NULL;
4494}
4495
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004496/* create or adjust a UnicodeEncodeError */
4497static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 const char *encoding,
4499 const Py_UNICODE *unicode, Py_ssize_t size,
4500 Py_ssize_t startpos, Py_ssize_t endpos,
4501 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004502{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004503 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 *exceptionObject = PyUnicodeEncodeError_Create(
4505 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 }
4507 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4509 goto onError;
4510 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4511 goto onError;
4512 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4513 goto onError;
4514 return;
4515 onError:
4516 Py_DECREF(*exceptionObject);
4517 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 }
4519}
4520
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521/* raises a UnicodeEncodeError */
4522static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 const char *encoding,
4524 const Py_UNICODE *unicode, Py_ssize_t size,
4525 Py_ssize_t startpos, Py_ssize_t endpos,
4526 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527{
4528 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532}
4533
4534/* error handling callback helper:
4535 build arguments, call the callback and check the arguments,
4536 put the result into newpos and return the replacement string, which
4537 has to be freed by the caller */
4538static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 PyObject **errorHandler,
4540 const char *encoding, const char *reason,
4541 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4542 Py_ssize_t startpos, Py_ssize_t endpos,
4543 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004544{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004545 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004546
4547 PyObject *restuple;
4548 PyObject *resunicode;
4549
4550 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 }
4555
4556 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004557 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004559 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560
4561 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004566 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 Py_DECREF(restuple);
4568 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004570 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004571 &resunicode, newpos)) {
4572 Py_DECREF(restuple);
4573 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004575 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4576 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4577 Py_DECREF(restuple);
4578 return NULL;
4579 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004582 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4584 Py_DECREF(restuple);
4585 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004586 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004587 Py_INCREF(resunicode);
4588 Py_DECREF(restuple);
4589 return resunicode;
4590}
4591
4592static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 Py_ssize_t size,
4594 const char *errors,
4595 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596{
4597 /* output object */
4598 PyObject *res;
4599 /* pointers to the beginning and end+1 of input */
4600 const Py_UNICODE *startp = p;
4601 const Py_UNICODE *endp = p + size;
4602 /* pointer to the beginning of the unencodable characters */
4603 /* const Py_UNICODE *badp = NULL; */
4604 /* pointer into the output */
4605 char *str;
4606 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004607 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004608 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4609 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 PyObject *errorHandler = NULL;
4611 PyObject *exc = NULL;
4612 /* the following variable is used for caching string comparisons
4613 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4614 int known_errorHandler = -1;
4615
4616 /* allocate enough for a simple encoding without
4617 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004618 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004619 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004620 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004622 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004623 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004624 ressize = size;
4625
4626 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004627 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628
Benjamin Peterson29060642009-01-31 22:14:21 +00004629 /* can we encode this? */
4630 if (c<limit) {
4631 /* no overflow check, because we know that the space is enough */
4632 *str++ = (char)c;
4633 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004634 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 else {
4636 Py_ssize_t unicodepos = p-startp;
4637 Py_ssize_t requiredsize;
4638 PyObject *repunicode;
4639 Py_ssize_t repsize;
4640 Py_ssize_t newpos;
4641 Py_ssize_t respos;
4642 Py_UNICODE *uni2;
4643 /* startpos for collecting unencodable chars */
4644 const Py_UNICODE *collstart = p;
4645 const Py_UNICODE *collend = p;
4646 /* find all unecodable characters */
4647 while ((collend < endp) && ((*collend)>=limit))
4648 ++collend;
4649 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4650 if (known_errorHandler==-1) {
4651 if ((errors==NULL) || (!strcmp(errors, "strict")))
4652 known_errorHandler = 1;
4653 else if (!strcmp(errors, "replace"))
4654 known_errorHandler = 2;
4655 else if (!strcmp(errors, "ignore"))
4656 known_errorHandler = 3;
4657 else if (!strcmp(errors, "xmlcharrefreplace"))
4658 known_errorHandler = 4;
4659 else
4660 known_errorHandler = 0;
4661 }
4662 switch (known_errorHandler) {
4663 case 1: /* strict */
4664 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4665 goto onError;
4666 case 2: /* replace */
4667 while (collstart++<collend)
4668 *str++ = '?'; /* fall through */
4669 case 3: /* ignore */
4670 p = collend;
4671 break;
4672 case 4: /* xmlcharrefreplace */
4673 respos = str - PyBytes_AS_STRING(res);
4674 /* determine replacement size (temporarily (mis)uses p) */
4675 for (p = collstart, repsize = 0; p < collend; ++p) {
4676 if (*p<10)
4677 repsize += 2+1+1;
4678 else if (*p<100)
4679 repsize += 2+2+1;
4680 else if (*p<1000)
4681 repsize += 2+3+1;
4682 else if (*p<10000)
4683 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004684#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004685 else
4686 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004687#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004688 else if (*p<100000)
4689 repsize += 2+5+1;
4690 else if (*p<1000000)
4691 repsize += 2+6+1;
4692 else
4693 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004694#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 }
4696 requiredsize = respos+repsize+(endp-collend);
4697 if (requiredsize > ressize) {
4698 if (requiredsize<2*ressize)
4699 requiredsize = 2*ressize;
4700 if (_PyBytes_Resize(&res, requiredsize))
4701 goto onError;
4702 str = PyBytes_AS_STRING(res) + respos;
4703 ressize = requiredsize;
4704 }
4705 /* generate replacement (temporarily (mis)uses p) */
4706 for (p = collstart; p < collend; ++p) {
4707 str += sprintf(str, "&#%d;", (int)*p);
4708 }
4709 p = collend;
4710 break;
4711 default:
4712 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4713 encoding, reason, startp, size, &exc,
4714 collstart-startp, collend-startp, &newpos);
4715 if (repunicode == NULL)
4716 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004717 if (PyBytes_Check(repunicode)) {
4718 /* Directly copy bytes result to output. */
4719 repsize = PyBytes_Size(repunicode);
4720 if (repsize > 1) {
4721 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004722 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004723 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4724 Py_DECREF(repunicode);
4725 goto onError;
4726 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004727 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004728 ressize += repsize-1;
4729 }
4730 memcpy(str, PyBytes_AsString(repunicode), repsize);
4731 str += repsize;
4732 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004733 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004734 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004735 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004736 /* need more space? (at least enough for what we
4737 have+the replacement+the rest of the string, so
4738 we won't have to check space for encodable characters) */
4739 respos = str - PyBytes_AS_STRING(res);
4740 repsize = PyUnicode_GET_SIZE(repunicode);
4741 requiredsize = respos+repsize+(endp-collend);
4742 if (requiredsize > ressize) {
4743 if (requiredsize<2*ressize)
4744 requiredsize = 2*ressize;
4745 if (_PyBytes_Resize(&res, requiredsize)) {
4746 Py_DECREF(repunicode);
4747 goto onError;
4748 }
4749 str = PyBytes_AS_STRING(res) + respos;
4750 ressize = requiredsize;
4751 }
4752 /* check if there is anything unencodable in the replacement
4753 and copy it to the output */
4754 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4755 c = *uni2;
4756 if (c >= limit) {
4757 raise_encode_exception(&exc, encoding, startp, size,
4758 unicodepos, unicodepos+1, reason);
4759 Py_DECREF(repunicode);
4760 goto onError;
4761 }
4762 *str = (char)c;
4763 }
4764 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004765 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004766 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004767 }
4768 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004769 /* Resize if we allocated to much */
4770 size = str - PyBytes_AS_STRING(res);
4771 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004772 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004773 if (_PyBytes_Resize(&res, size) < 0)
4774 goto onError;
4775 }
4776
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004777 Py_XDECREF(errorHandler);
4778 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004779 return res;
4780
4781 onError:
4782 Py_XDECREF(res);
4783 Py_XDECREF(errorHandler);
4784 Py_XDECREF(exc);
4785 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786}
4787
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 Py_ssize_t size,
4790 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004792 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
4795PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4796{
4797 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 PyErr_BadArgument();
4799 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 }
4801 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 PyUnicode_GET_SIZE(unicode),
4803 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804}
4805
4806/* --- 7-bit ASCII Codec -------------------------------------------------- */
4807
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004809 Py_ssize_t size,
4810 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 PyUnicodeObject *v;
4814 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004815 Py_ssize_t startinpos;
4816 Py_ssize_t endinpos;
4817 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 const char *e;
4819 PyObject *errorHandler = NULL;
4820 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004821
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004823 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004824 Py_UNICODE r = *(unsigned char*)s;
4825 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004826 }
Tim Petersced69f82003-09-16 20:30:58 +00004827
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828 v = _PyUnicode_New(size);
4829 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004830 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004832 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004834 e = s + size;
4835 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 register unsigned char c = (unsigned char)*s;
4837 if (c < 128) {
4838 *p++ = c;
4839 ++s;
4840 }
4841 else {
4842 startinpos = s-starts;
4843 endinpos = startinpos + 1;
4844 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4845 if (unicode_decode_call_errorhandler(
4846 errors, &errorHandler,
4847 "ascii", "ordinal not in range(128)",
4848 &starts, &e, &startinpos, &endinpos, &exc, &s,
4849 &v, &outpos, &p))
4850 goto onError;
4851 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004853 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4855 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 Py_XDECREF(errorHandler);
4857 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004859
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004862 Py_XDECREF(errorHandler);
4863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 return NULL;
4865}
4866
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 Py_ssize_t size,
4869 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004871 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872}
4873
4874PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4875{
4876 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 PyErr_BadArgument();
4878 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 }
4880 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004881 PyUnicode_GET_SIZE(unicode),
4882 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883}
4884
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004885#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004886
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004887/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004888
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004889#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004890#define NEED_RETRY
4891#endif
4892
4893/* XXX This code is limited to "true" double-byte encodings, as
4894 a) it assumes an incomplete character consists of a single byte, and
4895 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004897
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts still has to be
   checked against the preceding character, found with CharPrev(). */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev() did not move: there is no preceding character */
    if (prev == curr)
        return 1;
    /* the preceding character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the preceding character is two bytes wide */
    return curr - prev == 2;
}
4908
4909/*
4910 * Decode MBCS string into unicode object. If 'final' is set, converts
4911 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4912 */
4913static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 const char *s, /* MBCS string */
4915 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004916 int final,
4917 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004918{
4919 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004920 Py_ssize_t n;
4921 DWORD usize;
4922 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004923
4924 assert(size >= 0);
4925
Victor Stinner554f3f02010-06-16 23:33:54 +00004926 /* check and handle 'errors' arg */
4927 if (errors==NULL || strcmp(errors, "strict")==0)
4928 flags = MB_ERR_INVALID_CHARS;
4929 else if (strcmp(errors, "ignore")==0)
4930 flags = 0;
4931 else {
4932 PyErr_Format(PyExc_ValueError,
4933 "mbcs encoding does not support errors='%s'",
4934 errors);
4935 return -1;
4936 }
4937
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004938 /* Skip trailing lead-byte unless 'final' is set */
4939 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004941
4942 /* First get the size of the result */
4943 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004944 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4945 if (usize==0)
4946 goto mbcs_decode_error;
4947 } else
4948 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004949
4950 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 /* Create unicode object */
4952 *v = _PyUnicode_New(usize);
4953 if (*v == NULL)
4954 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004955 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004956 }
4957 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 /* Extend unicode object */
4959 n = PyUnicode_GET_SIZE(*v);
4960 if (_PyUnicode_Resize(v, n + usize) < 0)
4961 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004962 }
4963
4964 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004965 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004967 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4968 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004970 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004971 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004972
4973mbcs_decode_error:
4974 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4975 we raise a UnicodeDecodeError - else it is a 'generic'
4976 windows error
4977 */
4978 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4979 /* Ideally, we should get reason from FormatMessage - this
4980 is the Windows 2000 English version of the message
4981 */
4982 PyObject *exc = NULL;
4983 const char *reason = "No mapping for the Unicode character exists "
4984 "in the target multi-byte code page.";
4985 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4986 if (exc != NULL) {
4987 PyCodec_StrictErrors(exc);
4988 Py_DECREF(exc);
4989 }
4990 } else {
4991 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4992 }
4993 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004994}
4995
4996PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 Py_ssize_t size,
4998 const char *errors,
4999 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000{
5001 PyUnicodeObject *v = NULL;
5002 int done;
5003
5004 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005006
5007#ifdef NEED_RETRY
5008 retry:
5009 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005010 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005011 else
5012#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005013 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005014
5015 if (done < 0) {
5016 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018 }
5019
5020 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005022
5023#ifdef NEED_RETRY
5024 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 s += done;
5026 size -= done;
5027 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005028 }
5029#endif
5030
5031 return (PyObject *)v;
5032}
5033
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005034PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 Py_ssize_t size,
5036 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005037{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005038 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5039}
5040
5041/*
5042 * Convert unicode into string object (MBCS).
5043 * Returns 0 if succeed, -1 otherwise.
5044 */
5045static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005047 int size, /* size of unicode */
5048 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005049{
Victor Stinner554f3f02010-06-16 23:33:54 +00005050 BOOL usedDefaultChar = FALSE;
5051 BOOL *pusedDefaultChar;
5052 int mbcssize;
5053 Py_ssize_t n;
5054 PyObject *exc = NULL;
5055 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005056
5057 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005058
Victor Stinner554f3f02010-06-16 23:33:54 +00005059 /* check and handle 'errors' arg */
5060 if (errors==NULL || strcmp(errors, "strict")==0) {
5061 flags = WC_NO_BEST_FIT_CHARS;
5062 pusedDefaultChar = &usedDefaultChar;
5063 } else if (strcmp(errors, "replace")==0) {
5064 flags = 0;
5065 pusedDefaultChar = NULL;
5066 } else {
5067 PyErr_Format(PyExc_ValueError,
5068 "mbcs encoding does not support errors='%s'",
5069 errors);
5070 return -1;
5071 }
5072
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005073 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005074 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005075 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5076 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 if (mbcssize == 0) {
5078 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5079 return -1;
5080 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005081 /* If we used a default char, then we failed! */
5082 if (pusedDefaultChar && *pusedDefaultChar)
5083 goto mbcs_encode_error;
5084 } else {
5085 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005086 }
5087
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005088 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 /* Create string object */
5090 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5091 if (*repr == NULL)
5092 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005093 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005094 }
5095 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 /* Extend string object */
5097 n = PyBytes_Size(*repr);
5098 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5099 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005100 }
5101
5102 /* Do the conversion */
5103 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005105 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5106 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5108 return -1;
5109 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005110 if (pusedDefaultChar && *pusedDefaultChar)
5111 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005112 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005113 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005114
5115mbcs_encode_error:
5116 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5117 Py_XDECREF(exc);
5118 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005119}
5120
5121PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 Py_ssize_t size,
5123 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005124{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005125 PyObject *repr = NULL;
5126 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005127
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005128#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005130 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005131 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005132 else
5133#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005134 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005135
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005136 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 Py_XDECREF(repr);
5138 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005139 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005140
5141#ifdef NEED_RETRY
5142 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005143 p += INT_MAX;
5144 size -= INT_MAX;
5145 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005146 }
5147#endif
5148
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005149 return repr;
5150}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005151
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005152PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5153{
5154 if (!PyUnicode_Check(unicode)) {
5155 PyErr_BadArgument();
5156 return NULL;
5157 }
5158 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 PyUnicode_GET_SIZE(unicode),
5160 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005161}
5162
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005163#undef NEED_RETRY
5164
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005165#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005166
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167/* --- Character Mapping Codec -------------------------------------------- */
5168
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 Py_ssize_t size,
5171 PyObject *mapping,
5172 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005174 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005175 Py_ssize_t startinpos;
5176 Py_ssize_t endinpos;
5177 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005178 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 PyUnicodeObject *v;
5180 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005181 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182 PyObject *errorHandler = NULL;
5183 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005184 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005185 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005186
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 /* Default to Latin-1 */
5188 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190
5191 v = _PyUnicode_New(size);
5192 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005197 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005198 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 mapstring = PyUnicode_AS_UNICODE(mapping);
5200 maplen = PyUnicode_GET_SIZE(mapping);
5201 while (s < e) {
5202 unsigned char ch = *s;
5203 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204
Benjamin Peterson29060642009-01-31 22:14:21 +00005205 if (ch < maplen)
5206 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207
Benjamin Peterson29060642009-01-31 22:14:21 +00005208 if (x == 0xfffe) {
5209 /* undefined mapping */
5210 outpos = p-PyUnicode_AS_UNICODE(v);
5211 startinpos = s-starts;
5212 endinpos = startinpos+1;
5213 if (unicode_decode_call_errorhandler(
5214 errors, &errorHandler,
5215 "charmap", "character maps to <undefined>",
5216 &starts, &e, &startinpos, &endinpos, &exc, &s,
5217 &v, &outpos, &p)) {
5218 goto onError;
5219 }
5220 continue;
5221 }
5222 *p++ = x;
5223 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005224 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005225 }
5226 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005227 while (s < e) {
5228 unsigned char ch = *s;
5229 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005230
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5232 w = PyLong_FromLong((long)ch);
5233 if (w == NULL)
5234 goto onError;
5235 x = PyObject_GetItem(mapping, w);
5236 Py_DECREF(w);
5237 if (x == NULL) {
5238 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5239 /* No mapping found means: mapping is undefined. */
5240 PyErr_Clear();
5241 x = Py_None;
5242 Py_INCREF(x);
5243 } else
5244 goto onError;
5245 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005246
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 /* Apply mapping */
5248 if (PyLong_Check(x)) {
5249 long value = PyLong_AS_LONG(x);
5250 if (value < 0 || value > 65535) {
5251 PyErr_SetString(PyExc_TypeError,
5252 "character mapping must be in range(65536)");
5253 Py_DECREF(x);
5254 goto onError;
5255 }
5256 *p++ = (Py_UNICODE)value;
5257 }
5258 else if (x == Py_None) {
5259 /* undefined mapping */
5260 outpos = p-PyUnicode_AS_UNICODE(v);
5261 startinpos = s-starts;
5262 endinpos = startinpos+1;
5263 if (unicode_decode_call_errorhandler(
5264 errors, &errorHandler,
5265 "charmap", "character maps to <undefined>",
5266 &starts, &e, &startinpos, &endinpos, &exc, &s,
5267 &v, &outpos, &p)) {
5268 Py_DECREF(x);
5269 goto onError;
5270 }
5271 Py_DECREF(x);
5272 continue;
5273 }
5274 else if (PyUnicode_Check(x)) {
5275 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005276
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 if (targetsize == 1)
5278 /* 1-1 mapping */
5279 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005280
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 else if (targetsize > 1) {
5282 /* 1-n mapping */
5283 if (targetsize > extrachars) {
5284 /* resize first */
5285 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5286 Py_ssize_t needed = (targetsize - extrachars) + \
5287 (targetsize << 2);
5288 extrachars += needed;
5289 /* XXX overflow detection missing */
5290 if (_PyUnicode_Resize(&v,
5291 PyUnicode_GET_SIZE(v) + needed) < 0) {
5292 Py_DECREF(x);
5293 goto onError;
5294 }
5295 p = PyUnicode_AS_UNICODE(v) + oldpos;
5296 }
5297 Py_UNICODE_COPY(p,
5298 PyUnicode_AS_UNICODE(x),
5299 targetsize);
5300 p += targetsize;
5301 extrachars -= targetsize;
5302 }
5303 /* 1-0 mapping: skip the character */
5304 }
5305 else {
5306 /* wrong return value */
5307 PyErr_SetString(PyExc_TypeError,
5308 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005309 Py_DECREF(x);
5310 goto onError;
5311 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 Py_DECREF(x);
5313 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 }
5316 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5318 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005319 Py_XDECREF(errorHandler);
5320 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005322
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005324 Py_XDECREF(errorHandler);
5325 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 Py_XDECREF(v);
5327 return NULL;
5328}
5329
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005330/* Charmap encoding: the lookup table */
5331
5332struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 PyObject_HEAD
5334 unsigned char level1[32];
5335 int count2, count3;
5336 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005337};
5338
5339static PyObject*
5340encoding_map_size(PyObject *obj, PyObject* args)
5341{
5342 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005343 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005345}
5346
5347static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005348 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 PyDoc_STR("Return the size (in bytes) of this object") },
5350 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005351};
5352
5353static void
5354encoding_map_dealloc(PyObject* o)
5355{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005356 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005357}
5358
5359static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005360 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 "EncodingMap", /*tp_name*/
5362 sizeof(struct encoding_map), /*tp_basicsize*/
5363 0, /*tp_itemsize*/
5364 /* methods */
5365 encoding_map_dealloc, /*tp_dealloc*/
5366 0, /*tp_print*/
5367 0, /*tp_getattr*/
5368 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005369 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 0, /*tp_repr*/
5371 0, /*tp_as_number*/
5372 0, /*tp_as_sequence*/
5373 0, /*tp_as_mapping*/
5374 0, /*tp_hash*/
5375 0, /*tp_call*/
5376 0, /*tp_str*/
5377 0, /*tp_getattro*/
5378 0, /*tp_setattro*/
5379 0, /*tp_as_buffer*/
5380 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5381 0, /*tp_doc*/
5382 0, /*tp_traverse*/
5383 0, /*tp_clear*/
5384 0, /*tp_richcompare*/
5385 0, /*tp_weaklistoffset*/
5386 0, /*tp_iter*/
5387 0, /*tp_iternext*/
5388 encoding_map_methods, /*tp_methods*/
5389 0, /*tp_members*/
5390 0, /*tp_getset*/
5391 0, /*tp_base*/
5392 0, /*tp_dict*/
5393 0, /*tp_descr_get*/
5394 0, /*tp_descr_set*/
5395 0, /*tp_dictoffset*/
5396 0, /*tp_init*/
5397 0, /*tp_alloc*/
5398 0, /*tp_new*/
5399 0, /*tp_free*/
5400 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005401};
5402
5403PyObject*
5404PyUnicode_BuildEncodingMap(PyObject* string)
5405{
5406 Py_UNICODE *decode;
5407 PyObject *result;
5408 struct encoding_map *mresult;
5409 int i;
5410 int need_dict = 0;
5411 unsigned char level1[32];
5412 unsigned char level2[512];
5413 unsigned char *mlevel1, *mlevel2, *mlevel3;
5414 int count2 = 0, count3 = 0;
5415
5416 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5417 PyErr_BadArgument();
5418 return NULL;
5419 }
5420 decode = PyUnicode_AS_UNICODE(string);
5421 memset(level1, 0xFF, sizeof level1);
5422 memset(level2, 0xFF, sizeof level2);
5423
5424 /* If there isn't a one-to-one mapping of NULL to \0,
5425 or if there are non-BMP characters, we need to use
5426 a mapping dictionary. */
5427 if (decode[0] != 0)
5428 need_dict = 1;
5429 for (i = 1; i < 256; i++) {
5430 int l1, l2;
5431 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005432#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005433 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005434#endif
5435 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005436 need_dict = 1;
5437 break;
5438 }
5439 if (decode[i] == 0xFFFE)
5440 /* unmapped character */
5441 continue;
5442 l1 = decode[i] >> 11;
5443 l2 = decode[i] >> 7;
5444 if (level1[l1] == 0xFF)
5445 level1[l1] = count2++;
5446 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005447 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005448 }
5449
5450 if (count2 >= 0xFF || count3 >= 0xFF)
5451 need_dict = 1;
5452
5453 if (need_dict) {
5454 PyObject *result = PyDict_New();
5455 PyObject *key, *value;
5456 if (!result)
5457 return NULL;
5458 for (i = 0; i < 256; i++) {
5459 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005460 key = PyLong_FromLong(decode[i]);
5461 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005462 if (!key || !value)
5463 goto failed1;
5464 if (PyDict_SetItem(result, key, value) == -1)
5465 goto failed1;
5466 Py_DECREF(key);
5467 Py_DECREF(value);
5468 }
5469 return result;
5470 failed1:
5471 Py_XDECREF(key);
5472 Py_XDECREF(value);
5473 Py_DECREF(result);
5474 return NULL;
5475 }
5476
5477 /* Create a three-level trie */
5478 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5479 16*count2 + 128*count3 - 1);
5480 if (!result)
5481 return PyErr_NoMemory();
5482 PyObject_Init(result, &EncodingMapType);
5483 mresult = (struct encoding_map*)result;
5484 mresult->count2 = count2;
5485 mresult->count3 = count3;
5486 mlevel1 = mresult->level1;
5487 mlevel2 = mresult->level23;
5488 mlevel3 = mresult->level23 + 16*count2;
5489 memcpy(mlevel1, level1, 32);
5490 memset(mlevel2, 0xFF, 16*count2);
5491 memset(mlevel3, 0, 128*count3);
5492 count3 = 0;
5493 for (i = 1; i < 256; i++) {
5494 int o1, o2, o3, i2, i3;
5495 if (decode[i] == 0xFFFE)
5496 /* unmapped character */
5497 continue;
5498 o1 = decode[i]>>11;
5499 o2 = (decode[i]>>7) & 0xF;
5500 i2 = 16*mlevel1[o1] + o2;
5501 if (mlevel2[i2] == 0xFF)
5502 mlevel2[i2] = count3++;
5503 o3 = decode[i] & 0x7F;
5504 i3 = 128*mlevel2[i2] + o3;
5505 mlevel3[i3] = i;
5506 }
5507 return result;
5508}
5509
5510static int
5511encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5512{
5513 struct encoding_map *map = (struct encoding_map*)mapping;
5514 int l1 = c>>11;
5515 int l2 = (c>>7) & 0xF;
5516 int l3 = c & 0x7F;
5517 int i;
5518
5519#ifdef Py_UNICODE_WIDE
5520 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005522 }
5523#endif
5524 if (c == 0)
5525 return 0;
5526 /* level 1*/
5527 i = map->level1[l1];
5528 if (i == 0xFF) {
5529 return -1;
5530 }
5531 /* level 2*/
5532 i = map->level23[16*i+l2];
5533 if (i == 0xFF) {
5534 return -1;
5535 }
5536 /* level 3 */
5537 i = map->level23[16*map->count2 + 128*i + l3];
5538 if (i == 0) {
5539 return -1;
5540 }
5541 return i;
5542}
5543
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544/* Lookup the character ch in the mapping. If the character
5545 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005546 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548{
Christian Heimes217cfd12007-12-02 14:31:20 +00005549 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005550 PyObject *x;
5551
5552 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 x = PyObject_GetItem(mapping, w);
5555 Py_DECREF(w);
5556 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5558 /* No mapping found means: mapping is undefined. */
5559 PyErr_Clear();
5560 x = Py_None;
5561 Py_INCREF(x);
5562 return x;
5563 } else
5564 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005566 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005567 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005568 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 long value = PyLong_AS_LONG(x);
5570 if (value < 0 || value > 255) {
5571 PyErr_SetString(PyExc_TypeError,
5572 "character mapping must be in range(256)");
5573 Py_DECREF(x);
5574 return NULL;
5575 }
5576 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005578 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005581 /* wrong return value */
5582 PyErr_Format(PyExc_TypeError,
5583 "character mapping must return integer, bytes or None, not %.400s",
5584 x->ob_type->tp_name);
5585 Py_DECREF(x);
5586 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 }
5588}
5589
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005590static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005591charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005592{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005593 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5594 /* exponentially overallocate to minimize reallocations */
5595 if (requiredsize < 2*outsize)
5596 requiredsize = 2*outsize;
5597 if (_PyBytes_Resize(outobj, requiredsize))
5598 return -1;
5599 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005600}
5601
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005605/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005606 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 space is available. Return a new reference to the object that
5608 was put in the output buffer, or Py_None, if the mapping was undefined
5609 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005610 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005612charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005614{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005615 PyObject *rep;
5616 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005617 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618
Christian Heimes90aa7642007-12-19 02:45:37 +00005619 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005620 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005622 if (res == -1)
5623 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 if (outsize<requiredsize)
5625 if (charmapencode_resize(outobj, outpos, requiredsize))
5626 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005627 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 outstart[(*outpos)++] = (char)res;
5629 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005630 }
5631
5632 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005633 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005635 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 Py_DECREF(rep);
5637 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005638 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005639 if (PyLong_Check(rep)) {
5640 Py_ssize_t requiredsize = *outpos+1;
5641 if (outsize<requiredsize)
5642 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5643 Py_DECREF(rep);
5644 return enc_EXCEPTION;
5645 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005646 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005648 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 else {
5650 const char *repchars = PyBytes_AS_STRING(rep);
5651 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5652 Py_ssize_t requiredsize = *outpos+repsize;
5653 if (outsize<requiredsize)
5654 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5655 Py_DECREF(rep);
5656 return enc_EXCEPTION;
5657 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005658 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 memcpy(outstart + *outpos, repchars, repsize);
5660 *outpos += repsize;
5661 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005662 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005663 Py_DECREF(rep);
5664 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665}
5666
5667/* handle an error in PyUnicode_EncodeCharmap
5668 Return 0 on success, -1 on error */
5669static
5670int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005671 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005673 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005674 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675{
5676 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005677 Py_ssize_t repsize;
5678 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 Py_UNICODE *uni2;
5680 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005681 Py_ssize_t collstartpos = *inpos;
5682 Py_ssize_t collendpos = *inpos+1;
5683 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 char *encoding = "charmap";
5685 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005686 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005687
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688 /* find all unencodable characters */
5689 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005690 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005691 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 int res = encoding_map_lookup(p[collendpos], mapping);
5693 if (res != -1)
5694 break;
5695 ++collendpos;
5696 continue;
5697 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005698
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 rep = charmapencode_lookup(p[collendpos], mapping);
5700 if (rep==NULL)
5701 return -1;
5702 else if (rep!=Py_None) {
5703 Py_DECREF(rep);
5704 break;
5705 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005706 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 }
5709 /* cache callback name lookup
5710 * (if not done yet, i.e. it's the first error) */
5711 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 if ((errors==NULL) || (!strcmp(errors, "strict")))
5713 *known_errorHandler = 1;
5714 else if (!strcmp(errors, "replace"))
5715 *known_errorHandler = 2;
5716 else if (!strcmp(errors, "ignore"))
5717 *known_errorHandler = 3;
5718 else if (!strcmp(errors, "xmlcharrefreplace"))
5719 *known_errorHandler = 4;
5720 else
5721 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 }
5723 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005724 case 1: /* strict */
5725 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5726 return -1;
5727 case 2: /* replace */
5728 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 x = charmapencode_output('?', mapping, res, respos);
5730 if (x==enc_EXCEPTION) {
5731 return -1;
5732 }
5733 else if (x==enc_FAILED) {
5734 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5735 return -1;
5736 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005737 }
5738 /* fall through */
5739 case 3: /* ignore */
5740 *inpos = collendpos;
5741 break;
5742 case 4: /* xmlcharrefreplace */
5743 /* generate replacement (temporarily (mis)uses p) */
5744 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 char buffer[2+29+1+1];
5746 char *cp;
5747 sprintf(buffer, "&#%d;", (int)p[collpos]);
5748 for (cp = buffer; *cp; ++cp) {
5749 x = charmapencode_output(*cp, mapping, res, respos);
5750 if (x==enc_EXCEPTION)
5751 return -1;
5752 else if (x==enc_FAILED) {
5753 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5754 return -1;
5755 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005756 }
5757 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005758 *inpos = collendpos;
5759 break;
5760 default:
5761 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 encoding, reason, p, size, exceptionObject,
5763 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005764 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005766 if (PyBytes_Check(repunicode)) {
5767 /* Directly copy bytes result to output. */
5768 Py_ssize_t outsize = PyBytes_Size(*res);
5769 Py_ssize_t requiredsize;
5770 repsize = PyBytes_Size(repunicode);
5771 requiredsize = *respos + repsize;
5772 if (requiredsize > outsize)
5773 /* Make room for all additional bytes. */
5774 if (charmapencode_resize(res, respos, requiredsize)) {
5775 Py_DECREF(repunicode);
5776 return -1;
5777 }
5778 memcpy(PyBytes_AsString(*res) + *respos,
5779 PyBytes_AsString(repunicode), repsize);
5780 *respos += repsize;
5781 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005782 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005783 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005784 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005785 /* generate replacement */
5786 repsize = PyUnicode_GET_SIZE(repunicode);
5787 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005788 x = charmapencode_output(*uni2, mapping, res, respos);
5789 if (x==enc_EXCEPTION) {
5790 return -1;
5791 }
5792 else if (x==enc_FAILED) {
5793 Py_DECREF(repunicode);
5794 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5795 return -1;
5796 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005797 }
5798 *inpos = newpos;
5799 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 }
5801 return 0;
5802}
5803
Guido van Rossumd57fd912000-03-10 22:53:23 +00005804PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 Py_ssize_t size,
5806 PyObject *mapping,
5807 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 /* output object */
5810 PyObject *res = NULL;
5811 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005812 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005813 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005814 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815 PyObject *errorHandler = NULL;
5816 PyObject *exc = NULL;
5817 /* the following variable is used for caching string comparisons
5818 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5819 * 3=ignore, 4=xmlcharrefreplace */
5820 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
5822 /* Default to Latin-1 */
5823 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005826 /* allocate enough for a simple encoding without
5827 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005828 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 if (res == NULL)
5830 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005831 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 /* try to encode it */
5836 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5837 if (x==enc_EXCEPTION) /* error */
5838 goto onError;
5839 if (x==enc_FAILED) { /* unencodable character */
5840 if (charmap_encoding_error(p, size, &inpos, mapping,
5841 &exc,
5842 &known_errorHandler, &errorHandler, errors,
5843 &res, &respos)) {
5844 goto onError;
5845 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005846 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 else
5848 /* done with this character => adjust input position */
5849 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005852 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005853 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005854 if (_PyBytes_Resize(&res, respos) < 0)
5855 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 Py_XDECREF(exc);
5858 Py_XDECREF(errorHandler);
5859 return res;
5860
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 Py_XDECREF(res);
5863 Py_XDECREF(exc);
5864 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 return NULL;
5866}
5867
5868PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870{
5871 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 PyErr_BadArgument();
5873 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 }
5875 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 PyUnicode_GET_SIZE(unicode),
5877 mapping,
5878 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879}
5880
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881/* create or adjust a UnicodeTranslateError */
5882static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 const Py_UNICODE *unicode, Py_ssize_t size,
5884 Py_ssize_t startpos, Py_ssize_t endpos,
5885 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005887 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005888 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 }
5891 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5893 goto onError;
5894 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5895 goto onError;
5896 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5897 goto onError;
5898 return;
5899 onError:
5900 Py_DECREF(*exceptionObject);
5901 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 }
5903}
5904
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905/* raises a UnicodeTranslateError */
5906static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 const Py_UNICODE *unicode, Py_ssize_t size,
5908 Py_ssize_t startpos, Py_ssize_t endpos,
5909 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005910{
5911 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005913 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915}
5916
5917/* error handling callback helper:
5918 build arguments, call the callback and check the arguments,
5919 put the result into newpos and return the replacement string, which
5920 has to be freed by the caller */
5921static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 PyObject **errorHandler,
5923 const char *reason,
5924 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5925 Py_ssize_t startpos, Py_ssize_t endpos,
5926 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005928 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005930 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 PyObject *restuple;
5932 PyObject *resunicode;
5933
5934 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 }
5939
5940 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005942 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005944
5945 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005949 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005950 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 Py_DECREF(restuple);
5952 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005953 }
5954 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 &resunicode, &i_newpos)) {
5956 Py_DECREF(restuple);
5957 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005959 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005961 else
5962 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005963 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5965 Py_DECREF(restuple);
5966 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005967 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005968 Py_INCREF(resunicode);
5969 Py_DECREF(restuple);
5970 return resunicode;
5971}
5972
5973/* Lookup the character ch in the mapping and put the result in result,
5974 which must be decrefed by the caller.
5975 Return 0 on success, -1 on error */
5976static
5977int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5978{
Christian Heimes217cfd12007-12-02 14:31:20 +00005979 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980 PyObject *x;
5981
5982 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005984 x = PyObject_GetItem(mapping, w);
5985 Py_DECREF(w);
5986 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5988 /* No mapping found means: use 1:1 mapping. */
5989 PyErr_Clear();
5990 *result = NULL;
5991 return 0;
5992 } else
5993 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 }
5995 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 *result = x;
5997 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005999 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 long value = PyLong_AS_LONG(x);
6001 long max = PyUnicode_GetMax();
6002 if (value < 0 || value > max) {
6003 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006004 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006005 Py_DECREF(x);
6006 return -1;
6007 }
6008 *result = x;
6009 return 0;
6010 }
6011 else if (PyUnicode_Check(x)) {
6012 *result = x;
6013 return 0;
6014 }
6015 else {
6016 /* wrong return value */
6017 PyErr_SetString(PyExc_TypeError,
6018 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006019 Py_DECREF(x);
6020 return -1;
6021 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022}
6023/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 if not reallocate and adjust various state variables.
6025 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026static
Walter Dörwald4894c302003-10-24 14:25:28 +00006027int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006030 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006031 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 /* remember old output position */
6033 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6034 /* exponentially overallocate to minimize reallocations */
6035 if (requiredsize < 2 * oldsize)
6036 requiredsize = 2 * oldsize;
6037 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6038 return -1;
6039 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006040 }
6041 return 0;
6042}
6043/* lookup the character, put the result in the output string and adjust
6044 various state variables. Return a new reference to the object that
6045 was put in the output buffer in *result, or Py_None, if the mapping was
6046 undefined (in which case no character was written).
6047 The called must decref result.
6048 Return 0 on success, -1 on error. */
6049static
Walter Dörwald4894c302003-10-24 14:25:28 +00006050int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6052 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053{
Walter Dörwald4894c302003-10-24 14:25:28 +00006054 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 /* not found => default to 1:1 mapping */
6058 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 }
6060 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006062 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 /* no overflow check, because we know that the space is enough */
6064 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065 }
6066 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6068 if (repsize==1) {
6069 /* no overflow check, because we know that the space is enough */
6070 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6071 }
6072 else if (repsize!=0) {
6073 /* more than one character */
6074 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6075 (insize - (curinp-startinp)) +
6076 repsize - 1;
6077 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6078 return -1;
6079 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6080 *outp += repsize;
6081 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 }
6083 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006085 return 0;
6086}
6087
6088PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 Py_ssize_t size,
6090 PyObject *mapping,
6091 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 /* output object */
6094 PyObject *res = NULL;
6095 /* pointers to the beginning and end+1 of input */
6096 const Py_UNICODE *startp = p;
6097 const Py_UNICODE *endp = p + size;
6098 /* pointer into the output */
6099 Py_UNICODE *str;
6100 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006101 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102 char *reason = "character maps to <undefined>";
6103 PyObject *errorHandler = NULL;
6104 PyObject *exc = NULL;
6105 /* the following variable is used for caching string comparisons
6106 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6107 * 3=ignore, 4=xmlcharrefreplace */
6108 int known_errorHandler = -1;
6109
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 PyErr_BadArgument();
6112 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114
6115 /* allocate enough for a simple 1:1 translation without
6116 replacements, if we need more, we'll resize */
6117 res = PyUnicode_FromUnicode(NULL, size);
6118 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006124 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 /* try to encode it */
6126 PyObject *x = NULL;
6127 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6128 Py_XDECREF(x);
6129 goto onError;
6130 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006131 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 if (x!=Py_None) /* it worked => adjust input pointer */
6133 ++p;
6134 else { /* untranslatable character */
6135 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6136 Py_ssize_t repsize;
6137 Py_ssize_t newpos;
6138 Py_UNICODE *uni2;
6139 /* startpos for collecting untranslatable chars */
6140 const Py_UNICODE *collstart = p;
6141 const Py_UNICODE *collend = p+1;
6142 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 /* find all untranslatable characters */
6145 while (collend < endp) {
6146 if (charmaptranslate_lookup(*collend, mapping, &x))
6147 goto onError;
6148 Py_XDECREF(x);
6149 if (x!=Py_None)
6150 break;
6151 ++collend;
6152 }
6153 /* cache callback name lookup
6154 * (if not done yet, i.e. it's the first error) */
6155 if (known_errorHandler==-1) {
6156 if ((errors==NULL) || (!strcmp(errors, "strict")))
6157 known_errorHandler = 1;
6158 else if (!strcmp(errors, "replace"))
6159 known_errorHandler = 2;
6160 else if (!strcmp(errors, "ignore"))
6161 known_errorHandler = 3;
6162 else if (!strcmp(errors, "xmlcharrefreplace"))
6163 known_errorHandler = 4;
6164 else
6165 known_errorHandler = 0;
6166 }
6167 switch (known_errorHandler) {
6168 case 1: /* strict */
6169 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006170 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 case 2: /* replace */
6172 /* No need to check for space, this is a 1:1 replacement */
6173 for (coll = collstart; coll<collend; ++coll)
6174 *str++ = '?';
6175 /* fall through */
6176 case 3: /* ignore */
6177 p = collend;
6178 break;
6179 case 4: /* xmlcharrefreplace */
6180 /* generate replacement (temporarily (mis)uses p) */
6181 for (p = collstart; p < collend; ++p) {
6182 char buffer[2+29+1+1];
6183 char *cp;
6184 sprintf(buffer, "&#%d;", (int)*p);
6185 if (charmaptranslate_makespace(&res, &str,
6186 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6187 goto onError;
6188 for (cp = buffer; *cp; ++cp)
6189 *str++ = *cp;
6190 }
6191 p = collend;
6192 break;
6193 default:
6194 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6195 reason, startp, size, &exc,
6196 collstart-startp, collend-startp, &newpos);
6197 if (repunicode == NULL)
6198 goto onError;
6199 /* generate replacement */
6200 repsize = PyUnicode_GET_SIZE(repunicode);
6201 if (charmaptranslate_makespace(&res, &str,
6202 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6203 Py_DECREF(repunicode);
6204 goto onError;
6205 }
6206 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6207 *str++ = *uni2;
6208 p = startp + newpos;
6209 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006210 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006211 }
6212 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006213 /* Resize if we allocated to much */
6214 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006215 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 if (PyUnicode_Resize(&res, respos) < 0)
6217 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006218 }
6219 Py_XDECREF(exc);
6220 Py_XDECREF(errorHandler);
6221 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006224 Py_XDECREF(res);
6225 Py_XDECREF(exc);
6226 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 return NULL;
6228}
6229
6230PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 PyObject *mapping,
6232 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233{
6234 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006235
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 str = PyUnicode_FromObject(str);
6237 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 PyUnicode_GET_SIZE(str),
6241 mapping,
6242 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 Py_DECREF(str);
6244 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006245
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 Py_XDECREF(str);
6248 return NULL;
6249}
Tim Petersced69f82003-09-16 20:30:58 +00006250
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006251PyObject *
6252PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6253 Py_ssize_t length)
6254{
6255 PyObject *result;
6256 Py_UNICODE *p; /* write pointer into result */
6257 Py_ssize_t i;
6258 /* Copy to a new string */
6259 result = (PyObject *)_PyUnicode_New(length);
6260 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6261 if (result == NULL)
6262 return result;
6263 p = PyUnicode_AS_UNICODE(result);
6264 /* Iterate over code points */
6265 for (i = 0; i < length; i++) {
6266 Py_UNICODE ch =s[i];
6267 if (ch > 127) {
6268 int decimal = Py_UNICODE_TODECIMAL(ch);
6269 if (decimal >= 0)
6270 p[i] = '0' + decimal;
6271 }
6272 }
6273 return result;
6274}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006275/* --- Decimal Encoder ---------------------------------------------------- */
6276
6277int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 Py_ssize_t length,
6279 char *output,
6280 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006281{
6282 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 PyObject *errorHandler = NULL;
6284 PyObject *exc = NULL;
6285 const char *encoding = "decimal";
6286 const char *reason = "invalid decimal Unicode string";
6287 /* the following variable is used for caching string comparisons
6288 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6289 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006290
6291 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 PyErr_BadArgument();
6293 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006294 }
6295
6296 p = s;
6297 end = s + length;
6298 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 register Py_UNICODE ch = *p;
6300 int decimal;
6301 PyObject *repunicode;
6302 Py_ssize_t repsize;
6303 Py_ssize_t newpos;
6304 Py_UNICODE *uni2;
6305 Py_UNICODE *collstart;
6306 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006307
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006309 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006310 ++p;
6311 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006312 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 decimal = Py_UNICODE_TODECIMAL(ch);
6314 if (decimal >= 0) {
6315 *output++ = '0' + decimal;
6316 ++p;
6317 continue;
6318 }
6319 if (0 < ch && ch < 256) {
6320 *output++ = (char)ch;
6321 ++p;
6322 continue;
6323 }
6324 /* All other characters are considered unencodable */
6325 collstart = p;
6326 collend = p+1;
6327 while (collend < end) {
6328 if ((0 < *collend && *collend < 256) ||
6329 !Py_UNICODE_ISSPACE(*collend) ||
6330 Py_UNICODE_TODECIMAL(*collend))
6331 break;
6332 }
6333 /* cache callback name lookup
6334 * (if not done yet, i.e. it's the first error) */
6335 if (known_errorHandler==-1) {
6336 if ((errors==NULL) || (!strcmp(errors, "strict")))
6337 known_errorHandler = 1;
6338 else if (!strcmp(errors, "replace"))
6339 known_errorHandler = 2;
6340 else if (!strcmp(errors, "ignore"))
6341 known_errorHandler = 3;
6342 else if (!strcmp(errors, "xmlcharrefreplace"))
6343 known_errorHandler = 4;
6344 else
6345 known_errorHandler = 0;
6346 }
6347 switch (known_errorHandler) {
6348 case 1: /* strict */
6349 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6350 goto onError;
6351 case 2: /* replace */
6352 for (p = collstart; p < collend; ++p)
6353 *output++ = '?';
6354 /* fall through */
6355 case 3: /* ignore */
6356 p = collend;
6357 break;
6358 case 4: /* xmlcharrefreplace */
6359 /* generate replacement (temporarily (mis)uses p) */
6360 for (p = collstart; p < collend; ++p)
6361 output += sprintf(output, "&#%d;", (int)*p);
6362 p = collend;
6363 break;
6364 default:
6365 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6366 encoding, reason, s, length, &exc,
6367 collstart-s, collend-s, &newpos);
6368 if (repunicode == NULL)
6369 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006370 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006371 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006372 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6373 Py_DECREF(repunicode);
6374 goto onError;
6375 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 /* generate replacement */
6377 repsize = PyUnicode_GET_SIZE(repunicode);
6378 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6379 Py_UNICODE ch = *uni2;
6380 if (Py_UNICODE_ISSPACE(ch))
6381 *output++ = ' ';
6382 else {
6383 decimal = Py_UNICODE_TODECIMAL(ch);
6384 if (decimal >= 0)
6385 *output++ = '0' + decimal;
6386 else if (0 < ch && ch < 256)
6387 *output++ = (char)ch;
6388 else {
6389 Py_DECREF(repunicode);
6390 raise_encode_exception(&exc, encoding,
6391 s, length, collstart-s, collend-s, reason);
6392 goto onError;
6393 }
6394 }
6395 }
6396 p = s + newpos;
6397 Py_DECREF(repunicode);
6398 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006399 }
6400 /* 0-terminate the output string */
6401 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 Py_XDECREF(exc);
6403 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006404 return 0;
6405
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 Py_XDECREF(exc);
6408 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006409 return -1;
6410}
6411
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412/* --- Helpers ------------------------------------------------------------ */
6413
Eric Smith8c663262007-08-25 02:26:07 +00006414#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006415#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006416
Thomas Wouters477c8d52006-05-27 19:21:47 +00006417#include "stringlib/count.h"
6418#include "stringlib/find.h"
6419#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006420#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006421
Eric Smith5807c412008-05-11 21:00:57 +00006422#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006423#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006424#include "stringlib/localeutil.h"
6425
/* helper macro to fixup start/end slice values.
   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clipped to 0.  `start` and `end` must be modifiable lvalues
   and are evaluated several times.
   The expansion is two consecutive `if` statements, not a single
   statement, so do not use the macro as the unbraced body of an if/else
   or a loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006440
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.
 * On narrow builds the returned value is cast to Py_UCS4; on wide builds
 * it is the dereferenced element itself.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 * (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT; 'ch' is evaluated twice */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
/* A pair is only combined when both of its halves lie inside the buffer,
   i.e. when ptr + 1 < end; the bounds test comes first so that ptr[1] is
   never read at or past 'end'. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((ptr) + 1 < (end)) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :  \
      (Py_UCS4)*(ptr)++)
#endif
6471
Martin v. Löwis18e16552006-02-15 17:27:45 +00006472Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006473 PyObject *substr,
6474 Py_ssize_t start,
6475 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006477 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006478 PyUnicodeObject* str_obj;
6479 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006480
Thomas Wouters477c8d52006-05-27 19:21:47 +00006481 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6482 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006484 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6485 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 Py_DECREF(str_obj);
6487 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 }
Tim Petersced69f82003-09-16 20:30:58 +00006489
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006490 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006491 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006492 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6493 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006494 );
6495
6496 Py_DECREF(sub_obj);
6497 Py_DECREF(str_obj);
6498
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499 return result;
6500}
6501
Martin v. Löwis18e16552006-02-15 17:27:45 +00006502Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006503 PyObject *sub,
6504 Py_ssize_t start,
6505 Py_ssize_t end,
6506 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006508 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006509
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006511 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006513 sub = PyUnicode_FromObject(sub);
6514 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 Py_DECREF(str);
6516 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 }
Tim Petersced69f82003-09-16 20:30:58 +00006518
Thomas Wouters477c8d52006-05-27 19:21:47 +00006519 if (direction > 0)
6520 result = stringlib_find_slice(
6521 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6522 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6523 start, end
6524 );
6525 else
6526 result = stringlib_rfind_slice(
6527 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6528 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6529 start, end
6530 );
6531
Guido van Rossumd57fd912000-03-10 22:53:23 +00006532 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006533 Py_DECREF(sub);
6534
Guido van Rossumd57fd912000-03-10 22:53:23 +00006535 return result;
6536}
6537
Tim Petersced69f82003-09-16 20:30:58 +00006538static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 PyUnicodeObject *substring,
6541 Py_ssize_t start,
6542 Py_ssize_t end,
6543 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 if (substring->length == 0)
6546 return 1;
6547
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006548 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 end -= substring->length;
6550 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006551 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552
6553 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 if (Py_UNICODE_MATCH(self, end, substring))
6555 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 } else {
6557 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 }
6560
6561 return 0;
6562}
6563
Martin v. Löwis18e16552006-02-15 17:27:45 +00006564Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 PyObject *substr,
6566 Py_ssize_t start,
6567 Py_ssize_t end,
6568 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006570 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006571
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 str = PyUnicode_FromObject(str);
6573 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 substr = PyUnicode_FromObject(substr);
6576 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 Py_DECREF(str);
6578 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 }
Tim Petersced69f82003-09-16 20:30:58 +00006580
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 (PyUnicodeObject *)substr,
6583 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 Py_DECREF(str);
6585 Py_DECREF(substr);
6586 return result;
6587}
6588
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589/* Apply fixfct filter to the Unicode object self and return a
6590 reference to the modified object */
6591
Tim Petersced69f82003-09-16 20:30:58 +00006592static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595{
6596
6597 PyUnicodeObject *u;
6598
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006599 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006602
6603 Py_UNICODE_COPY(u->str, self->str, self->length);
6604
Tim Peters7a29bd52001-09-12 03:03:31 +00006605 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 /* fixfct should return TRUE if it modified the buffer. If
6607 FALSE, return a reference to the original buffer instead
6608 (to save space, not time) */
6609 Py_INCREF(self);
6610 Py_DECREF(u);
6611 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 }
6613 return (PyObject*) u;
6614}
6615
Tim Petersced69f82003-09-16 20:30:58 +00006616static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617int fixupper(PyUnicodeObject *self)
6618{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006619 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 Py_UNICODE *s = self->str;
6621 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006622
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006625
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 ch = Py_UNICODE_TOUPPER(*s);
6627 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 *s = ch;
6630 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 s++;
6632 }
6633
6634 return status;
6635}
6636
Tim Petersced69f82003-09-16 20:30:58 +00006637static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638int fixlower(PyUnicodeObject *self)
6639{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006640 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006641 Py_UNICODE *s = self->str;
6642 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006643
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006646
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 ch = Py_UNICODE_TOLOWER(*s);
6648 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 *s = ch;
6651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 s++;
6653 }
6654
6655 return status;
6656}
6657
Tim Petersced69f82003-09-16 20:30:58 +00006658static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659int fixswapcase(PyUnicodeObject *self)
6660{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006661 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 Py_UNICODE *s = self->str;
6663 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006664
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 while (len-- > 0) {
6666 if (Py_UNICODE_ISUPPER(*s)) {
6667 *s = Py_UNICODE_TOLOWER(*s);
6668 status = 1;
6669 } else if (Py_UNICODE_ISLOWER(*s)) {
6670 *s = Py_UNICODE_TOUPPER(*s);
6671 status = 1;
6672 }
6673 s++;
6674 }
6675
6676 return status;
6677}
6678
Tim Petersced69f82003-09-16 20:30:58 +00006679static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680int fixcapitalize(PyUnicodeObject *self)
6681{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006682 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006683 Py_UNICODE *s = self->str;
6684 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006685
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006686 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 return 0;
Ezio Melottiee8d9982011-08-15 09:09:57 +03006688 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 *s = Py_UNICODE_TOUPPER(*s);
6690 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006692 s++;
6693 while (--len > 0) {
Ezio Melottiee8d9982011-08-15 09:09:57 +03006694 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006695 *s = Py_UNICODE_TOLOWER(*s);
6696 status = 1;
6697 }
6698 s++;
6699 }
6700 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701}
6702
6703static
6704int fixtitle(PyUnicodeObject *self)
6705{
6706 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6707 register Py_UNICODE *e;
6708 int previous_is_cased;
6709
6710 /* Shortcut for single character strings */
6711 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6713 if (*p != ch) {
6714 *p = ch;
6715 return 1;
6716 }
6717 else
6718 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 }
Tim Petersced69f82003-09-16 20:30:58 +00006720
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 e = p + PyUnicode_GET_SIZE(self);
6722 previous_is_cased = 0;
6723 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006725
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 if (previous_is_cased)
6727 *p = Py_UNICODE_TOLOWER(ch);
6728 else
6729 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006730
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 if (Py_UNICODE_ISLOWER(ch) ||
6732 Py_UNICODE_ISUPPER(ch) ||
6733 Py_UNICODE_ISTITLE(ch))
6734 previous_is_cased = 1;
6735 else
6736 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 }
6738 return 1;
6739}
6740
Tim Peters8ce9f162004-08-27 01:49:32 +00006741PyObject *
6742PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743{
Skip Montanaro6543b452004-09-16 03:28:13 +00006744 const Py_UNICODE blank = ' ';
6745 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006746 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006747 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006748 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6749 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006750 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6751 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006752 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006753 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754
Tim Peters05eba1f2004-08-27 21:32:02 +00006755 fseq = PySequence_Fast(seq, "");
6756 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006757 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006758 }
6759
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006760 /* NOTE: the following code can't call back into Python code,
6761 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006762 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006763
Tim Peters05eba1f2004-08-27 21:32:02 +00006764 seqlen = PySequence_Fast_GET_SIZE(fseq);
6765 /* If empty sequence, return u"". */
6766 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006767 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6768 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006769 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006770 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006771 /* If singleton sequence with an exact Unicode, return that. */
6772 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 item = items[0];
6774 if (PyUnicode_CheckExact(item)) {
6775 Py_INCREF(item);
6776 res = (PyUnicodeObject *)item;
6777 goto Done;
6778 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006779 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006780 else {
6781 /* Set up sep and seplen */
6782 if (separator == NULL) {
6783 sep = &blank;
6784 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006785 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006786 else {
6787 if (!PyUnicode_Check(separator)) {
6788 PyErr_Format(PyExc_TypeError,
6789 "separator: expected str instance,"
6790 " %.80s found",
6791 Py_TYPE(separator)->tp_name);
6792 goto onError;
6793 }
6794 sep = PyUnicode_AS_UNICODE(separator);
6795 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006796 }
6797 }
6798
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006799 /* There are at least two things to join, or else we have a subclass
6800 * of str in the sequence.
6801 * Do a pre-pass to figure out the total amount of space we'll
6802 * need (sz), and see whether all argument are strings.
6803 */
6804 sz = 0;
6805 for (i = 0; i < seqlen; i++) {
6806 const Py_ssize_t old_sz = sz;
6807 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 if (!PyUnicode_Check(item)) {
6809 PyErr_Format(PyExc_TypeError,
6810 "sequence item %zd: expected str instance,"
6811 " %.80s found",
6812 i, Py_TYPE(item)->tp_name);
6813 goto onError;
6814 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006815 sz += PyUnicode_GET_SIZE(item);
6816 if (i != 0)
6817 sz += seplen;
6818 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6819 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006821 goto onError;
6822 }
6823 }
Tim Petersced69f82003-09-16 20:30:58 +00006824
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006825 res = _PyUnicode_New(sz);
6826 if (res == NULL)
6827 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006828
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006829 /* Catenate everything. */
6830 res_p = PyUnicode_AS_UNICODE(res);
6831 for (i = 0; i < seqlen; ++i) {
6832 Py_ssize_t itemlen;
6833 item = items[i];
6834 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 /* Copy item, and maybe the separator. */
6836 if (i) {
6837 Py_UNICODE_COPY(res_p, sep, seplen);
6838 res_p += seplen;
6839 }
6840 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6841 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006842 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006843
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006845 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 return (PyObject *)res;
6847
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006849 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006850 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 return NULL;
6852}
6853
Tim Petersced69f82003-09-16 20:30:58 +00006854static
6855PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 Py_ssize_t left,
6857 Py_ssize_t right,
6858 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859{
6860 PyUnicodeObject *u;
6861
6862 if (left < 0)
6863 left = 0;
6864 if (right < 0)
6865 right = 0;
6866
Tim Peters7a29bd52001-09-12 03:03:31 +00006867 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 Py_INCREF(self);
6869 return self;
6870 }
6871
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006872 if (left > PY_SSIZE_T_MAX - self->length ||
6873 right > PY_SSIZE_T_MAX - (left + self->length)) {
6874 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6875 return NULL;
6876 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 u = _PyUnicode_New(left + self->length + right);
6878 if (u) {
6879 if (left)
6880 Py_UNICODE_FILL(u->str, fill, left);
6881 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6882 if (right)
6883 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6884 }
6885
6886 return u;
6887}
6888
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006889PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892
6893 string = PyUnicode_FromObject(string);
6894 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006897 list = stringlib_splitlines(
6898 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6899 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900
6901 Py_DECREF(string);
6902 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903}
6904
Tim Petersced69f82003-09-16 20:30:58 +00006905static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 PyUnicodeObject *substring,
6908 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006909{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006911 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912
Guido van Rossumd57fd912000-03-10 22:53:23 +00006913 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006914 return stringlib_split_whitespace(
6915 (PyObject*) self, self->str, self->length, maxcount
6916 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006918 return stringlib_split(
6919 (PyObject*) self, self->str, self->length,
6920 substring->str, substring->length,
6921 maxcount
6922 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006923}
6924
Tim Petersced69f82003-09-16 20:30:58 +00006925static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006926PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 PyUnicodeObject *substring,
6928 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006929{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006930 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006931 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006932
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006933 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006934 return stringlib_rsplit_whitespace(
6935 (PyObject*) self, self->str, self->length, maxcount
6936 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006937
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006938 return stringlib_rsplit(
6939 (PyObject*) self, self->str, self->length,
6940 substring->str, substring->length,
6941 maxcount
6942 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006943}
6944
6945static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 PyUnicodeObject *str1,
6948 PyUnicodeObject *str2,
6949 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006950{
6951 PyUnicodeObject *u;
6952
6953 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006954 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006955 else if (maxcount == 0 || self->length == 0)
6956 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957
Thomas Wouters477c8d52006-05-27 19:21:47 +00006958 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006959 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006960 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006961 if (str1->length == 0)
6962 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006963 if (str1->length == 1) {
6964 /* replace characters */
6965 Py_UNICODE u1, u2;
6966 if (!findchar(self->str, self->length, str1->str[0]))
6967 goto nothing;
6968 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6969 if (!u)
6970 return NULL;
6971 Py_UNICODE_COPY(u->str, self->str, self->length);
6972 u1 = str1->str[0];
6973 u2 = str2->str[0];
6974 for (i = 0; i < u->length; i++)
6975 if (u->str[i] == u1) {
6976 if (--maxcount < 0)
6977 break;
6978 u->str[i] = u2;
6979 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006981 i = stringlib_find(
6982 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006984 if (i < 0)
6985 goto nothing;
6986 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6987 if (!u)
6988 return NULL;
6989 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006990
6991 /* change everything in-place, starting with this one */
6992 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6993 i += str1->length;
6994
6995 while ( --maxcount > 0) {
6996 i = stringlib_find(self->str+i, self->length-i,
6997 str1->str, str1->length,
6998 i);
6999 if (i == -1)
7000 break;
7001 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7002 i += str1->length;
7003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007006
7007 Py_ssize_t n, i, j, e;
7008 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 Py_UNICODE *p;
7010
7011 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007012 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7013 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007014 if (n == 0)
7015 goto nothing;
7016 /* new_size = self->length + n * (str2->length - str1->length)); */
7017 delta = (str2->length - str1->length);
7018 if (delta == 0) {
7019 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007021 product = n * (str2->length - str1->length);
7022 if ((product / (str2->length - str1->length)) != n) {
7023 PyErr_SetString(PyExc_OverflowError,
7024 "replace string is too long");
7025 return NULL;
7026 }
7027 new_size = self->length + product;
7028 if (new_size < 0) {
7029 PyErr_SetString(PyExc_OverflowError,
7030 "replace string is too long");
7031 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 }
7033 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007034 u = _PyUnicode_New(new_size);
7035 if (!u)
7036 return NULL;
7037 i = 0;
7038 p = u->str;
7039 e = self->length - str1->length;
7040 if (str1->length > 0) {
7041 while (n-- > 0) {
7042 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007043 j = stringlib_find(self->str+i, self->length-i,
7044 str1->str, str1->length,
7045 i);
7046 if (j == -1)
7047 break;
7048 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007049 /* copy unchanged part [i:j] */
7050 Py_UNICODE_COPY(p, self->str+i, j-i);
7051 p += j - i;
7052 }
7053 /* copy substitution string */
7054 if (str2->length > 0) {
7055 Py_UNICODE_COPY(p, str2->str, str2->length);
7056 p += str2->length;
7057 }
7058 i = j + str1->length;
7059 }
7060 if (i < self->length)
7061 /* copy tail [i:] */
7062 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7063 } else {
7064 /* interleave */
7065 while (n > 0) {
7066 Py_UNICODE_COPY(p, str2->str, str2->length);
7067 p += str2->length;
7068 if (--n <= 0)
7069 break;
7070 *p++ = self->str[i++];
7071 }
7072 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007075 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007076
Benjamin Peterson29060642009-01-31 22:14:21 +00007077 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007078 /* nothing to replace; return original string (when possible) */
7079 if (PyUnicode_CheckExact(self)) {
7080 Py_INCREF(self);
7081 return (PyObject *) self;
7082 }
7083 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084}
7085
7086/* --- Unicode Object Methods --------------------------------------------- */
7087
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007088PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007089 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090\n\
7091Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007092characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093
7094static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007095unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097 return fixup(self, fixtitle);
7098}
7099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007100PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102\n\
7103Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007104have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007105
7106static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007107unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 return fixup(self, fixcapitalize);
7110}
7111
#if 0
/* Disabled: this method is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split on whitespace, without limit. */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Replace every word by its capitalized version. */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)old_word, fixcapitalize);
        if (result == NULL)
            goto onError;       /* result is NULL, which is what we return */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Glue the words back together with single spaces. */
    result = PyUnicode_Join(NULL, words);

  onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
7149
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007150/* Argument converter. Coerces to a single unicode character */
7151
7152static int
7153convert_uc(PyObject *obj, void *addr)
7154{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007155 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7156 PyObject *uniobj;
7157 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007158
Benjamin Peterson14339b62009-01-31 16:36:08 +00007159 uniobj = PyUnicode_FromObject(obj);
7160 if (uniobj == NULL) {
7161 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007163 return 0;
7164 }
7165 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7166 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007167 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007168 Py_DECREF(uniobj);
7169 return 0;
7170 }
7171 unistr = PyUnicode_AS_UNICODE(uniobj);
7172 *fillcharloc = unistr[0];
7173 Py_DECREF(uniobj);
7174 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007175}
7176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007177PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007180Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007181done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007182
7183static PyObject *
7184unicode_center(PyUnicodeObject *self, PyObject *args)
7185{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007186 Py_ssize_t marg, left;
7187 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007188 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189
Thomas Woutersde017742006-02-16 19:34:37 +00007190 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 return NULL;
7192
Tim Peters7a29bd52001-09-12 03:03:31 +00007193 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194 Py_INCREF(self);
7195 return (PyObject*) self;
7196 }
7197
7198 marg = width - self->length;
7199 left = marg / 2 + (marg & width & 1);
7200
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007201 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202}
7203
Marc-André Lemburge5034372000-08-08 08:04:29 +00007204#if 0
7205
7206/* This code should go into some future Unicode collation support
7207 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007208 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007209
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007210/* speedy UTF-16 code point order comparison */
7211/* gleaned from: */
7212/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7213
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007214static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007215{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007216 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007217 0, 0, 0, 0, 0, 0, 0, 0,
7218 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007219 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007220};
7221
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222static int
7223unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7224{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007225 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007226
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 Py_UNICODE *s1 = str1->str;
7228 Py_UNICODE *s2 = str2->str;
7229
7230 len1 = str1->length;
7231 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007232
Guido van Rossumd57fd912000-03-10 22:53:23 +00007233 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007234 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007235
7236 c1 = *s1++;
7237 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007238
Benjamin Peterson29060642009-01-31 22:14:21 +00007239 if (c1 > (1<<11) * 26)
7240 c1 += utf16Fixup[c1>>11];
7241 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007242 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007243 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007244
7245 if (c1 != c2)
7246 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007247
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007248 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007249 }
7250
7251 return (len1 < len2) ? -1 : (len1 != len2);
7252}
7253
Marc-André Lemburge5034372000-08-08 08:04:29 +00007254#else
7255
7256static int
7257unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7258{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007259 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007260
7261 Py_UNICODE *s1 = str1->str;
7262 Py_UNICODE *s2 = str2->str;
7263
7264 len1 = str1->length;
7265 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007266
Marc-André Lemburge5034372000-08-08 08:04:29 +00007267 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007268 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007269
Fredrik Lundh45714e92001-06-26 16:39:36 +00007270 c1 = *s1++;
7271 c2 = *s2++;
7272
7273 if (c1 != c2)
7274 return (c1 < c2) ? -1 : 1;
7275
Marc-André Lemburge5034372000-08-08 08:04:29 +00007276 len1--; len2--;
7277 }
7278
7279 return (len1 < len2) ? -1 : (len1 != len2);
7280}
7281
7282#endif
7283
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007285 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007287 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7288 return unicode_compare((PyUnicodeObject *)left,
7289 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007290 PyErr_Format(PyExc_TypeError,
7291 "Can't compare %.100s and %.100s",
7292 left->ob_type->tp_name,
7293 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294 return -1;
7295}
7296
Martin v. Löwis5b222132007-06-10 09:51:05 +00007297int
7298PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7299{
7300 int i;
7301 Py_UNICODE *id;
7302 assert(PyUnicode_Check(uni));
7303 id = PyUnicode_AS_UNICODE(uni);
7304 /* Compare Unicode string and source character set string */
7305 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 if (id[i] != str[i])
7307 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007308 /* This check keeps Python strings that end in '\0' from comparing equal
7309 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007310 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007312 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007313 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007314 return 0;
7315}
7316
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007317
Benjamin Peterson29060642009-01-31 22:14:21 +00007318#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007319 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007320
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007321PyObject *PyUnicode_RichCompare(PyObject *left,
7322 PyObject *right,
7323 int op)
7324{
7325 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007326
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007327 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7328 PyObject *v;
7329 if (((PyUnicodeObject *) left)->length !=
7330 ((PyUnicodeObject *) right)->length) {
7331 if (op == Py_EQ) {
7332 Py_INCREF(Py_False);
7333 return Py_False;
7334 }
7335 if (op == Py_NE) {
7336 Py_INCREF(Py_True);
7337 return Py_True;
7338 }
7339 }
7340 if (left == right)
7341 result = 0;
7342 else
7343 result = unicode_compare((PyUnicodeObject *)left,
7344 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007345
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007346 /* Convert the return value to a Boolean */
7347 switch (op) {
7348 case Py_EQ:
7349 v = TEST_COND(result == 0);
7350 break;
7351 case Py_NE:
7352 v = TEST_COND(result != 0);
7353 break;
7354 case Py_LE:
7355 v = TEST_COND(result <= 0);
7356 break;
7357 case Py_GE:
7358 v = TEST_COND(result >= 0);
7359 break;
7360 case Py_LT:
7361 v = TEST_COND(result == -1);
7362 break;
7363 case Py_GT:
7364 v = TEST_COND(result == 1);
7365 break;
7366 default:
7367 PyErr_BadArgument();
7368 return NULL;
7369 }
7370 Py_INCREF(v);
7371 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007372 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007373
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007374 Py_INCREF(Py_NotImplemented);
7375 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007376}
7377
Guido van Rossum403d68b2000-03-13 15:55:09 +00007378int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007380{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007381 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007382 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007383
7384 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007385 sub = PyUnicode_FromObject(element);
7386 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 PyErr_Format(PyExc_TypeError,
7388 "'in <string>' requires string as left operand, not %s",
7389 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007390 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007391 }
7392
Thomas Wouters477c8d52006-05-27 19:21:47 +00007393 str = PyUnicode_FromObject(container);
7394 if (!str) {
7395 Py_DECREF(sub);
7396 return -1;
7397 }
7398
7399 result = stringlib_contains_obj(str, sub);
7400
7401 Py_DECREF(str);
7402 Py_DECREF(sub);
7403
Guido van Rossum403d68b2000-03-13 15:55:09 +00007404 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007405}
7406
Guido van Rossumd57fd912000-03-10 22:53:23 +00007407/* Concat to string or Unicode object giving a new Unicode object. */
7408
7409PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411{
7412 PyUnicodeObject *u = NULL, *v = NULL, *w;
7413
7414 /* Coerce the two arguments */
7415 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7416 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7419 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007420 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007421
7422 /* Shortcuts */
7423 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 Py_DECREF(v);
7425 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007426 }
7427 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 Py_DECREF(u);
7429 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430 }
7431
7432 /* Concat the two Unicode strings */
7433 w = _PyUnicode_New(u->length + v->length);
7434 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007435 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 Py_UNICODE_COPY(w->str, u->str, u->length);
7437 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7438
7439 Py_DECREF(u);
7440 Py_DECREF(v);
7441 return (PyObject *)w;
7442
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444 Py_XDECREF(u);
7445 Py_XDECREF(v);
7446 return NULL;
7447}
7448
Walter Dörwald1ab83302007-05-18 17:15:44 +00007449void
7450PyUnicode_Append(PyObject **pleft, PyObject *right)
7451{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007452 PyObject *new;
7453 if (*pleft == NULL)
7454 return;
7455 if (right == NULL || !PyUnicode_Check(*pleft)) {
7456 Py_DECREF(*pleft);
7457 *pleft = NULL;
7458 return;
7459 }
7460 new = PyUnicode_Concat(*pleft, right);
7461 Py_DECREF(*pleft);
7462 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007463}
7464
7465void
7466PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7467{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007468 PyUnicode_Append(pleft, right);
7469 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007470}
7471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007472PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007474\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007475Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007476string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007477interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478
7479static PyObject *
7480unicode_count(PyUnicodeObject *self, PyObject *args)
7481{
7482 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007483 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007484 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 PyObject *result;
7486
Jesus Ceaac451502011-04-20 17:09:23 +02007487 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7488 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007490
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007491 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007492 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007493 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007494 substring->str, substring->length,
7495 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007496 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497
7498 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007499
Guido van Rossumd57fd912000-03-10 22:53:23 +00007500 return result;
7501}
7502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007504 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007506Encode S using the codec registered for encoding. Default encoding\n\
7507is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007508handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007509a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7510'xmlcharrefreplace' as well as any other name registered with\n\
7511codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512
7513static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007514unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007515{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007516 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517 char *encoding = NULL;
7518 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007519
Benjamin Peterson308d6372009-09-18 21:42:35 +00007520 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7521 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007523 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007524}
7525
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007526PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528\n\
7529Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007530If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531
7532static PyObject*
7533unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7534{
7535 Py_UNICODE *e;
7536 Py_UNICODE *p;
7537 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007538 Py_UNICODE *qe;
7539 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 PyUnicodeObject *u;
7541 int tabsize = 8;
7542
7543 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545
Thomas Wouters7e474022000-07-16 12:04:32 +00007546 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007547 i = 0; /* chars up to and including most recent \n or \r */
7548 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7549 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 for (p = self->str; p < e; p++)
7551 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 if (tabsize > 0) {
7553 incr = tabsize - (j % tabsize); /* cannot overflow */
7554 if (j > PY_SSIZE_T_MAX - incr)
7555 goto overflow1;
7556 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 if (j > PY_SSIZE_T_MAX - 1)
7561 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 j++;
7563 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 if (i > PY_SSIZE_T_MAX - j)
7565 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007566 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007567 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 }
7569 }
7570
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007571 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007573
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 /* Second pass: create output string and fill it */
7575 u = _PyUnicode_New(i + j);
7576 if (!u)
7577 return NULL;
7578
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007579 j = 0; /* same as in first pass */
7580 q = u->str; /* next output char */
7581 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582
7583 for (p = self->str; p < e; p++)
7584 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 if (tabsize > 0) {
7586 i = tabsize - (j % tabsize);
7587 j += i;
7588 while (i--) {
7589 if (q >= qe)
7590 goto overflow2;
7591 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007592 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007594 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 else {
7596 if (q >= qe)
7597 goto overflow2;
7598 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007599 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 if (*p == '\n' || *p == '\r')
7601 j = 0;
7602 }
7603
7604 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007605
7606 overflow2:
7607 Py_DECREF(u);
7608 overflow1:
7609 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611}
7612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007613PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615\n\
7616Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08007617such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618arguments start and end are interpreted as in slice notation.\n\
7619\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007620Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007621
7622static PyObject *
7623unicode_find(PyUnicodeObject *self, PyObject *args)
7624{
Jesus Ceaac451502011-04-20 17:09:23 +02007625 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007626 Py_ssize_t start;
7627 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007628 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629
Jesus Ceaac451502011-04-20 17:09:23 +02007630 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7631 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
Thomas Wouters477c8d52006-05-27 19:21:47 +00007634 result = stringlib_find_slice(
7635 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7636 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7637 start, end
7638 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639
7640 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007641
Christian Heimes217cfd12007-12-02 14:31:20 +00007642 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643}
7644
7645static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007646unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647{
7648 if (index < 0 || index >= self->length) {
7649 PyErr_SetString(PyExc_IndexError, "string index out of range");
7650 return NULL;
7651 }
7652
7653 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7654}
7655
Guido van Rossumc2504932007-09-18 19:42:40 +00007656/* Believe it or not, this produces the same value for ASCII strings
7657 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007658static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007659unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660{
Guido van Rossumc2504932007-09-18 19:42:40 +00007661 Py_ssize_t len;
7662 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007663 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007664
7665 if (self->hash != -1)
7666 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007667 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007668 p = self->str;
7669 x = *p << 7;
7670 while (--len >= 0)
7671 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007672 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007673 if (x == -1)
7674 x = -2;
7675 self->hash = x;
7676 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677}
7678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007679PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007682Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683
7684static PyObject *
7685unicode_index(PyUnicodeObject *self, PyObject *args)
7686{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007687 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007688 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007689 Py_ssize_t start;
7690 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691
Jesus Ceaac451502011-04-20 17:09:23 +02007692 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7693 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695
Thomas Wouters477c8d52006-05-27 19:21:47 +00007696 result = stringlib_find_slice(
7697 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7698 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7699 start, end
7700 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701
7702 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007703
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704 if (result < 0) {
7705 PyErr_SetString(PyExc_ValueError, "substring not found");
7706 return NULL;
7707 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007708
Christian Heimes217cfd12007-12-02 14:31:20 +00007709 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710}
7711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007712PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007715Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007716at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717
7718static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007719unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720{
7721 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7722 register const Py_UNICODE *e;
7723 int cased;
7724
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 /* Shortcut for single character strings */
7726 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007729 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007730 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007732
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 e = p + PyUnicode_GET_SIZE(self);
7734 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007735 while (p < e) {
7736 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007737
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7739 return PyBool_FromLong(0);
7740 else if (!cased && Py_UNICODE_ISLOWER(ch))
7741 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007743 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744}
7745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007746PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007749Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007750at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751
7752static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007753unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754{
7755 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7756 register const Py_UNICODE *e;
7757 int cased;
7758
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 /* Shortcut for single character strings */
7760 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007763 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007764 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007766
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767 e = p + PyUnicode_GET_SIZE(self);
7768 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007769 while (p < e) {
7770 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007771
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7773 return PyBool_FromLong(0);
7774 else if (!cased && Py_UNICODE_ISUPPER(ch))
7775 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007777 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778}
7779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007780PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007783Return True if S is a titlecased string and there is at least one\n\
7784character in S, i.e. upper- and titlecase characters may only\n\
7785follow uncased characters and lowercase characters only cased ones.\n\
7786Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787
7788static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007789unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790{
7791 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7792 register const Py_UNICODE *e;
7793 int cased, previous_is_cased;
7794
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795 /* Shortcut for single character strings */
7796 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7798 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007800 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007801 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007803
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 e = p + PyUnicode_GET_SIZE(self);
7805 cased = 0;
7806 previous_is_cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007807 while (p < e) {
7808 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007809
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7811 if (previous_is_cased)
7812 return PyBool_FromLong(0);
7813 previous_is_cased = 1;
7814 cased = 1;
7815 }
7816 else if (Py_UNICODE_ISLOWER(ch)) {
7817 if (!previous_is_cased)
7818 return PyBool_FromLong(0);
7819 previous_is_cased = 1;
7820 cased = 1;
7821 }
7822 else
7823 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007825 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826}
7827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007828PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007831Return True if all characters in S are whitespace\n\
7832and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833
7834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007835unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836{
7837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7838 register const Py_UNICODE *e;
7839
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 /* Shortcut for single character strings */
7841 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 Py_UNICODE_ISSPACE(*p))
7843 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007845 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007846 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007848
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007850 while (p < e) {
7851 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7852 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007855 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856}
7857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007858PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007860\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007861Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007862and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007863
7864static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007865unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007866{
7867 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7868 register const Py_UNICODE *e;
7869
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007870 /* Shortcut for single character strings */
7871 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 Py_UNICODE_ISALPHA(*p))
7873 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007874
7875 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007876 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007878
7879 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007880 while (p < e) {
7881 if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007883 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007884 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007885}
7886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007887PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007889\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007890Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007891and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007892
7893static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007894unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007895{
7896 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7897 register const Py_UNICODE *e;
7898
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007899 /* Shortcut for single character strings */
7900 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 Py_UNICODE_ISALNUM(*p))
7902 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007903
7904 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007905 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007907
7908 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007909 while (p < e) {
7910 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7911 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007913 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007914 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007915}
7916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007917PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007920Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007921False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007922
7923static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007924unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007925{
7926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7927 register const Py_UNICODE *e;
7928
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929 /* Shortcut for single character strings */
7930 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 Py_UNICODE_ISDECIMAL(*p))
7932 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007934 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007935 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007937
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007939 while (p < e) {
7940 if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007943 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007944}
7945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007946PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007949Return True if all characters in S are digits\n\
7950and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951
7952static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007953unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954{
7955 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7956 register const Py_UNICODE *e;
7957
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 /* Shortcut for single character strings */
7959 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 Py_UNICODE_ISDIGIT(*p))
7961 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007963 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007964 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007966
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007968 while (p < e) {
7969 if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007972 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973}
7974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007975PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007978Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007979False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980
7981static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007982unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983{
7984 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7985 register const Py_UNICODE *e;
7986
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 /* Shortcut for single character strings */
7988 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 Py_UNICODE_ISNUMERIC(*p))
7990 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007992 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007993 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007995
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007997 while (p < e) {
7998 if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008001 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002}
8003
Martin v. Löwis47383402007-08-15 07:32:56 +00008004int
8005PyUnicode_IsIdentifier(PyObject *self)
8006{
Benjamin Petersonf413b802011-08-12 22:17:18 -05008007 const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008008 const Py_UNICODE *e;
8009 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +00008010
8011 /* Special case for empty strings */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008012 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008014
8015 /* PEP 3131 says that the first character must be in
8016 XID_Start and subsequent characters in XID_Continue,
8017 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008018 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008019 letters, digits, underscore). However, given the current
8020 definition of XID_Start and XID_Continue, it is sufficient
8021 to check just for these, except that _ must be allowed
8022 as starting an identifier. */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008023 e = p + PyUnicode_GET_SIZE(self);
8024 first = _Py_UNICODE_NEXT(p, e);
Benjamin Petersonf413b802011-08-12 22:17:18 -05008025 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +00008026 return 0;
8027
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008028 while (p < e)
8029 if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008031 return 1;
8032}
8033
8034PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008036\n\
8037Return True if S is a valid identifier according\n\
8038to the language definition.");
8039
8040static PyObject*
8041unicode_isidentifier(PyObject *self)
8042{
8043 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8044}
8045
Georg Brandl559e5d72008-06-11 18:37:52 +00008046PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008048\n\
8049Return True if all characters in S are considered\n\
8050printable in repr() or S is empty, False otherwise.");
8051
8052static PyObject*
8053unicode_isprintable(PyObject *self)
8054{
8055 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8056 register const Py_UNICODE *e;
8057
8058 /* Shortcut for single character strings */
8059 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8060 Py_RETURN_TRUE;
8061 }
8062
8063 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008064 while (p < e) {
8065 if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
Georg Brandl559e5d72008-06-11 18:37:52 +00008066 Py_RETURN_FALSE;
8067 }
8068 }
8069 Py_RETURN_TRUE;
8070}
8071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008072PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008073 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074\n\
8075Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008076iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077
8078static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008079unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008081 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082}
8083
Martin v. Löwis18e16552006-02-15 17:27:45 +00008084static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085unicode_length(PyUnicodeObject *self)
8086{
8087 return self->length;
8088}
8089
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008090PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008091 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008093Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008094done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095
8096static PyObject *
8097unicode_ljust(PyUnicodeObject *self, PyObject *args)
8098{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008099 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008100 Py_UNICODE fillchar = ' ';
8101
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008102 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103 return NULL;
8104
Tim Peters7a29bd52001-09-12 03:03:31 +00008105 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106 Py_INCREF(self);
8107 return (PyObject*) self;
8108 }
8109
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008110 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111}
8112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008113PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008116Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117
8118static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008119unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121 return fixup(self, fixlower);
8122}
8123
/* Strip modes.  The numeric values double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for mode i: the format string minus its 3-char "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
8132
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008133/* externally visible for str.strip(unicode) */
8134PyObject *
8135_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8136{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008137 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8138 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8139 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8140 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8141 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008142
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008144
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 i = 0;
8146 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8148 i++;
8149 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008150 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008151
Benjamin Peterson14339b62009-01-31 16:36:08 +00008152 j = len;
8153 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 do {
8155 j--;
8156 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8157 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008159
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 Py_INCREF(self);
8162 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008163 }
8164 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008166}
8167
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168
8169static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008170do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008171{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8173 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008174
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 i = 0;
8176 if (striptype != RIGHTSTRIP) {
8177 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8178 i++;
8179 }
8180 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008181
Benjamin Peterson14339b62009-01-31 16:36:08 +00008182 j = len;
8183 if (striptype != LEFTSTRIP) {
8184 do {
8185 j--;
8186 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8187 j++;
8188 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008189
Benjamin Peterson14339b62009-01-31 16:36:08 +00008190 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8191 Py_INCREF(self);
8192 return (PyObject*)self;
8193 }
8194 else
8195 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196}
8197
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008198
8199static PyObject *
8200do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8201{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008202 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008203
Benjamin Peterson14339b62009-01-31 16:36:08 +00008204 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8205 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008206
Benjamin Peterson14339b62009-01-31 16:36:08 +00008207 if (sep != NULL && sep != Py_None) {
8208 if (PyUnicode_Check(sep))
8209 return _PyUnicode_XStrip(self, striptype, sep);
8210 else {
8211 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 "%s arg must be None or str",
8213 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 return NULL;
8215 }
8216 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008217
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008219}
8220
8221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008222PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008224\n\
8225Return a copy of the string S with leading and trailing\n\
8226whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008227If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008228
8229static PyObject *
8230unicode_strip(PyUnicodeObject *self, PyObject *args)
8231{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 if (PyTuple_GET_SIZE(args) == 0)
8233 return do_strip(self, BOTHSTRIP); /* Common case */
8234 else
8235 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008236}
8237
8238
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008239PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008241\n\
8242Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008243If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008244
8245static PyObject *
8246unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8247{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008248 if (PyTuple_GET_SIZE(args) == 0)
8249 return do_strip(self, LEFTSTRIP); /* Common case */
8250 else
8251 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008252}
8253
8254
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008255PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008257\n\
8258Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008259If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008260
8261static PyObject *
8262unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8263{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008264 if (PyTuple_GET_SIZE(args) == 0)
8265 return do_strip(self, RIGHTSTRIP); /* Common case */
8266 else
8267 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008268}
8269
8270
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008272unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273{
8274 PyUnicodeObject *u;
8275 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008276 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008277 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278
Georg Brandl222de0f2009-04-12 12:01:50 +00008279 if (len < 1) {
8280 Py_INCREF(unicode_empty);
8281 return (PyObject *)unicode_empty;
8282 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008283
Tim Peters7a29bd52001-09-12 03:03:31 +00008284 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 /* no repeat, return original string */
8286 Py_INCREF(str);
8287 return (PyObject*) str;
8288 }
Tim Peters8f422462000-09-09 06:13:41 +00008289
8290 /* ensure # of chars needed doesn't overflow int and # of bytes
8291 * needed doesn't overflow size_t
8292 */
8293 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008294 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008295 PyErr_SetString(PyExc_OverflowError,
8296 "repeated string is too long");
8297 return NULL;
8298 }
8299 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8300 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8301 PyErr_SetString(PyExc_OverflowError,
8302 "repeated string is too long");
8303 return NULL;
8304 }
8305 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306 if (!u)
8307 return NULL;
8308
8309 p = u->str;
8310
Georg Brandl222de0f2009-04-12 12:01:50 +00008311 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008312 Py_UNICODE_FILL(p, str->str[0], len);
8313 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008314 Py_ssize_t done = str->length; /* number of characters copied this far */
8315 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008317 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008318 Py_UNICODE_COPY(p+done, p, n);
8319 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 }
8322
8323 return (PyObject*) u;
8324}
8325
8326PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 PyObject *subobj,
8328 PyObject *replobj,
8329 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330{
8331 PyObject *self;
8332 PyObject *str1;
8333 PyObject *str2;
8334 PyObject *result;
8335
8336 self = PyUnicode_FromObject(obj);
8337 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339 str1 = PyUnicode_FromObject(subobj);
8340 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 Py_DECREF(self);
8342 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 }
8344 str2 = PyUnicode_FromObject(replobj);
8345 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 Py_DECREF(self);
8347 Py_DECREF(str1);
8348 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 }
Tim Petersced69f82003-09-16 20:30:58 +00008350 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 (PyUnicodeObject *)str1,
8352 (PyUnicodeObject *)str2,
8353 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 Py_DECREF(self);
8355 Py_DECREF(str1);
8356 Py_DECREF(str2);
8357 return result;
8358}
8359
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008360PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008361 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362\n\
8363Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008364old replaced by new. If the optional argument count is\n\
8365given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366
8367static PyObject*
8368unicode_replace(PyUnicodeObject *self, PyObject *args)
8369{
8370 PyUnicodeObject *str1;
8371 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008372 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 PyObject *result;
8374
Martin v. Löwis18e16552006-02-15 17:27:45 +00008375 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376 return NULL;
8377 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8378 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008381 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 Py_DECREF(str1);
8383 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008384 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008385
8386 result = replace(self, str1, str2, maxcount);
8387
8388 Py_DECREF(str1);
8389 Py_DECREF(str2);
8390 return result;
8391}
8392
8393static
8394PyObject *unicode_repr(PyObject *unicode)
8395{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008396 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008397 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008398 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8399 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8400
8401 /* XXX(nnorwitz): rather than over-allocating, it would be
8402 better to choose a different scheme. Perhaps scan the
8403 first N-chars of the string and allocate based on that size.
8404 */
8405 /* Initial allocation is based on the longest-possible unichr
8406 escape.
8407
8408 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8409 unichr, so in this case it's the longest unichr escape. In
8410 narrow (UTF-16) builds this is five chars per source unichr
8411 since there are two unichrs in the surrogate pair, so in narrow
8412 (UTF-16) builds it's not the longest unichr escape.
8413
8414 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8415 so in the narrow (UTF-16) build case it's the longest unichr
8416 escape.
8417 */
8418
Walter Dörwald1ab83302007-05-18 17:15:44 +00008419 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008421#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008423#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008425#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008427 if (repr == NULL)
8428 return NULL;
8429
Walter Dörwald1ab83302007-05-18 17:15:44 +00008430 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008431
8432 /* Add quote */
8433 *p++ = (findchar(s, size, '\'') &&
8434 !findchar(s, size, '"')) ? '"' : '\'';
8435 while (size-- > 0) {
8436 Py_UNICODE ch = *s++;
8437
8438 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008439 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008440 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008441 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008442 continue;
8443 }
8444
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008446 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008447 *p++ = '\\';
8448 *p++ = 't';
8449 }
8450 else if (ch == '\n') {
8451 *p++ = '\\';
8452 *p++ = 'n';
8453 }
8454 else if (ch == '\r') {
8455 *p++ = '\\';
8456 *p++ = 'r';
8457 }
8458
8459 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008460 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008461 *p++ = '\\';
8462 *p++ = 'x';
8463 *p++ = hexdigits[(ch >> 4) & 0x000F];
8464 *p++ = hexdigits[ch & 0x000F];
8465 }
8466
Georg Brandl559e5d72008-06-11 18:37:52 +00008467 /* Copy ASCII characters as-is */
8468 else if (ch < 0x7F) {
8469 *p++ = ch;
8470 }
8471
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008473 else {
8474 Py_UCS4 ucs = ch;
8475
8476#ifndef Py_UNICODE_WIDE
8477 Py_UNICODE ch2 = 0;
8478 /* Get code point from surrogate pair */
8479 if (size > 0) {
8480 ch2 = *s;
8481 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008483 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008485 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008486 size--;
8487 }
8488 }
8489#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008490 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008491 (categories Z* and C* except ASCII space)
8492 */
8493 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8494 /* Map 8-bit characters to '\xhh' */
8495 if (ucs <= 0xff) {
8496 *p++ = '\\';
8497 *p++ = 'x';
8498 *p++ = hexdigits[(ch >> 4) & 0x000F];
8499 *p++ = hexdigits[ch & 0x000F];
8500 }
8501 /* Map 21-bit characters to '\U00xxxxxx' */
8502 else if (ucs >= 0x10000) {
8503 *p++ = '\\';
8504 *p++ = 'U';
8505 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8506 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8507 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8508 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8509 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8510 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8511 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8512 *p++ = hexdigits[ucs & 0x0000000F];
8513 }
8514 /* Map 16-bit characters to '\uxxxx' */
8515 else {
8516 *p++ = '\\';
8517 *p++ = 'u';
8518 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8519 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8520 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8521 *p++ = hexdigits[ucs & 0x000F];
8522 }
8523 }
8524 /* Copy characters as-is */
8525 else {
8526 *p++ = ch;
8527#ifndef Py_UNICODE_WIDE
8528 if (ucs >= 0x10000)
8529 *p++ = ch2;
8530#endif
8531 }
8532 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008533 }
8534 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008535 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008536
8537 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008538 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008539 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540}
8541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008542PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544\n\
8545Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08008546such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008547arguments start and end are interpreted as in slice notation.\n\
8548\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008549Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550
8551static PyObject *
8552unicode_rfind(PyUnicodeObject *self, PyObject *args)
8553{
Jesus Ceaac451502011-04-20 17:09:23 +02008554 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008555 Py_ssize_t start;
8556 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008557 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558
Jesus Ceaac451502011-04-20 17:09:23 +02008559 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8560 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562
Thomas Wouters477c8d52006-05-27 19:21:47 +00008563 result = stringlib_rfind_slice(
8564 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8565 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8566 start, end
8567 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568
8569 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008570
Christian Heimes217cfd12007-12-02 14:31:20 +00008571 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572}
8573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008574PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008577Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578
8579static PyObject *
8580unicode_rindex(PyUnicodeObject *self, PyObject *args)
8581{
Jesus Ceaac451502011-04-20 17:09:23 +02008582 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008583 Py_ssize_t start;
8584 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008585 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586
Jesus Ceaac451502011-04-20 17:09:23 +02008587 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8588 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590
Thomas Wouters477c8d52006-05-27 19:21:47 +00008591 result = stringlib_rfind_slice(
8592 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8593 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8594 start, end
8595 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
8597 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008598
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 if (result < 0) {
8600 PyErr_SetString(PyExc_ValueError, "substring not found");
8601 return NULL;
8602 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008603 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604}
8605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008606PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008609Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008610done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
8612static PyObject *
8613unicode_rjust(PyUnicodeObject *self, PyObject *args)
8614{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008615 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008616 Py_UNICODE fillchar = ' ';
8617
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008618 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619 return NULL;
8620
Tim Peters7a29bd52001-09-12 03:03:31 +00008621 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622 Py_INCREF(self);
8623 return (PyObject*) self;
8624 }
8625
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008626 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627}
8628
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 PyObject *sep,
8631 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632{
8633 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008634
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 s = PyUnicode_FromObject(s);
8636 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008637 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 if (sep != NULL) {
8639 sep = PyUnicode_FromObject(sep);
8640 if (sep == NULL) {
8641 Py_DECREF(s);
8642 return NULL;
8643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 }
8645
8646 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8647
8648 Py_DECREF(s);
8649 Py_XDECREF(sep);
8650 return result;
8651}
8652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008653PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655\n\
8656Return a list of the words in S, using sep as the\n\
8657delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008658splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008659whitespace string is a separator and empty strings are\n\
8660removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661
8662static PyObject*
8663unicode_split(PyUnicodeObject *self, PyObject *args)
8664{
8665 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008666 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667
Martin v. Löwis18e16552006-02-15 17:27:45 +00008668 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 return NULL;
8670
8671 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677}
8678
Thomas Wouters477c8d52006-05-27 19:21:47 +00008679PyObject *
8680PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8681{
8682 PyObject* str_obj;
8683 PyObject* sep_obj;
8684 PyObject* out;
8685
8686 str_obj = PyUnicode_FromObject(str_in);
8687 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008689 sep_obj = PyUnicode_FromObject(sep_in);
8690 if (!sep_obj) {
8691 Py_DECREF(str_obj);
8692 return NULL;
8693 }
8694
8695 out = stringlib_partition(
8696 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8697 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8698 );
8699
8700 Py_DECREF(sep_obj);
8701 Py_DECREF(str_obj);
8702
8703 return out;
8704}
8705
8706
8707PyObject *
8708PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8709{
8710 PyObject* str_obj;
8711 PyObject* sep_obj;
8712 PyObject* out;
8713
8714 str_obj = PyUnicode_FromObject(str_in);
8715 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008717 sep_obj = PyUnicode_FromObject(sep_in);
8718 if (!sep_obj) {
8719 Py_DECREF(str_obj);
8720 return NULL;
8721 }
8722
8723 out = stringlib_rpartition(
8724 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8725 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8726 );
8727
8728 Py_DECREF(sep_obj);
8729 Py_DECREF(str_obj);
8730
8731 return out;
8732}
8733
8734PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008736\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008737Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008738the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008739found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008740
8741static PyObject*
8742unicode_partition(PyUnicodeObject *self, PyObject *separator)
8743{
8744 return PyUnicode_Partition((PyObject *)self, separator);
8745}
8746
8747PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008748 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008749\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008750Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008751the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008752separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008753
8754static PyObject*
8755unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8756{
8757 return PyUnicode_RPartition((PyObject *)self, separator);
8758}
8759
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008760PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 PyObject *sep,
8762 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008763{
8764 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008765
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008766 s = PyUnicode_FromObject(s);
8767 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008768 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 if (sep != NULL) {
8770 sep = PyUnicode_FromObject(sep);
8771 if (sep == NULL) {
8772 Py_DECREF(s);
8773 return NULL;
8774 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008775 }
8776
8777 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8778
8779 Py_DECREF(s);
8780 Py_XDECREF(sep);
8781 return result;
8782}
8783
8784PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008786\n\
8787Return a list of the words in S, using sep as the\n\
8788delimiter string, starting at the end of the string and\n\
8789working to the front. If maxsplit is given, at most maxsplit\n\
8790splits are done. If sep is not specified, any whitespace string\n\
8791is a separator.");
8792
8793static PyObject*
8794unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8795{
8796 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008797 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008798
Martin v. Löwis18e16552006-02-15 17:27:45 +00008799 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008800 return NULL;
8801
8802 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008804 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008806 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008807 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008808}
8809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008810PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008812\n\
8813Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008814Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008815is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816
8817static PyObject*
8818unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8819{
Guido van Rossum86662912000-04-11 15:38:46 +00008820 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008821
Guido van Rossum86662912000-04-11 15:38:46 +00008822 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008823 return NULL;
8824
Guido van Rossum86662912000-04-11 15:38:46 +00008825 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826}
8827
8828static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008829PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830{
Walter Dörwald346737f2007-05-31 10:44:43 +00008831 if (PyUnicode_CheckExact(self)) {
8832 Py_INCREF(self);
8833 return self;
8834 } else
8835 /* Subtype -- return genuine unicode string with the same value. */
8836 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8837 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008838}
8839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008840PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008842\n\
8843Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008844and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845
8846static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008847unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849 return fixup(self, fixswapcase);
8850}
8851
Georg Brandlceee0772007-11-27 23:48:05 +00008852PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008854\n\
8855Return a translation table usable for str.translate().\n\
8856If there is only one argument, it must be a dictionary mapping Unicode\n\
8857ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008858Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008859If there are two arguments, they must be strings of equal length, and\n\
8860in the resulting dictionary, each character in x will be mapped to the\n\
8861character at the same position in y. If there is a third argument, it\n\
8862must be a string, whose characters will be mapped to None in the result.");
8863
8864static PyObject*
8865unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8866{
8867 PyObject *x, *y = NULL, *z = NULL;
8868 PyObject *new = NULL, *key, *value;
8869 Py_ssize_t i = 0;
8870 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008871
Georg Brandlceee0772007-11-27 23:48:05 +00008872 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8873 return NULL;
8874 new = PyDict_New();
8875 if (!new)
8876 return NULL;
8877 if (y != NULL) {
8878 /* x must be a string too, of equal length */
8879 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8880 if (!PyUnicode_Check(x)) {
8881 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8882 "be a string if there is a second argument");
8883 goto err;
8884 }
8885 if (PyUnicode_GET_SIZE(x) != ylen) {
8886 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8887 "arguments must have equal length");
8888 goto err;
8889 }
8890 /* create entries for translating chars in x to those in y */
8891 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008892 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8893 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008894 if (!key || !value)
8895 goto err;
8896 res = PyDict_SetItem(new, key, value);
8897 Py_DECREF(key);
8898 Py_DECREF(value);
8899 if (res < 0)
8900 goto err;
8901 }
8902 /* create entries for deleting chars in z */
8903 if (z != NULL) {
8904 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008905 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008906 if (!key)
8907 goto err;
8908 res = PyDict_SetItem(new, key, Py_None);
8909 Py_DECREF(key);
8910 if (res < 0)
8911 goto err;
8912 }
8913 }
8914 } else {
8915 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008916 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008917 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8918 "to maketrans it must be a dict");
8919 goto err;
8920 }
8921 /* copy entries into the new dict, converting string keys to int keys */
8922 while (PyDict_Next(x, &i, &key, &value)) {
8923 if (PyUnicode_Check(key)) {
8924 /* convert string keys to integer keys */
8925 PyObject *newkey;
8926 if (PyUnicode_GET_SIZE(key) != 1) {
8927 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8928 "table must be of length 1");
8929 goto err;
8930 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008931 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008932 if (!newkey)
8933 goto err;
8934 res = PyDict_SetItem(new, newkey, value);
8935 Py_DECREF(newkey);
8936 if (res < 0)
8937 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008938 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008939 /* just keep integer keys */
8940 if (PyDict_SetItem(new, key, value) < 0)
8941 goto err;
8942 } else {
8943 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8944 "be strings or integers");
8945 goto err;
8946 }
8947 }
8948 }
8949 return new;
8950 err:
8951 Py_DECREF(new);
8952 return NULL;
8953}
8954
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008955PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957\n\
8958Return a copy of the string S, where all characters have been mapped\n\
8959through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008960Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008961Unmapped characters are left untouched. Characters mapped to None\n\
8962are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963
8964static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008965unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966{
Georg Brandlceee0772007-11-27 23:48:05 +00008967 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968}
8969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008970PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008973Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974
8975static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008976unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 return fixup(self, fixupper);
8979}
8980
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008981PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008982 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008984Pad a numeric string S with zeros on the left, to fill a field\n\
8985of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986
8987static PyObject *
8988unicode_zfill(PyUnicodeObject *self, PyObject *args)
8989{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008990 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991 PyUnicodeObject *u;
8992
Martin v. Löwis18e16552006-02-15 17:27:45 +00008993 Py_ssize_t width;
8994 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 return NULL;
8996
8997 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008998 if (PyUnicode_CheckExact(self)) {
8999 Py_INCREF(self);
9000 return (PyObject*) self;
9001 }
9002 else
9003 return PyUnicode_FromUnicode(
9004 PyUnicode_AS_UNICODE(self),
9005 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007 }
9008
9009 fill = width - self->length;
9010
9011 u = pad(self, fill, 0, '0');
9012
Walter Dörwald068325e2002-04-15 13:36:47 +00009013 if (u == NULL)
9014 return NULL;
9015
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 if (u->str[fill] == '+' || u->str[fill] == '-') {
9017 /* move sign to beginning of string */
9018 u->str[0] = u->str[fill];
9019 u->str[fill] = '0';
9020 }
9021
9022 return (PyObject*) u;
9023}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024
9025#if 0
9026static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009027unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028{
Christian Heimes2202f872008-02-06 14:31:34 +00009029 return PyLong_FromLong(numfree);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030}
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009031
9032static PyObject *
9033unicode__decimal2ascii(PyObject *self)
9034{
9035 return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
9036 PyUnicode_GET_SIZE(self));
9037}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038#endif
9039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009040PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009041 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009043Return True if S starts with the specified prefix, False otherwise.\n\
9044With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009045With optional end, stop comparing S at that position.\n\
9046prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047
9048static PyObject *
9049unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009052 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009054 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009055 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009056 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057
Jesus Ceaac451502011-04-20 17:09:23 +02009058 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009059 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009060 if (PyTuple_Check(subobj)) {
9061 Py_ssize_t i;
9062 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9063 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009064 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009065 if (substring == NULL)
9066 return NULL;
9067 result = tailmatch(self, substring, start, end, -1);
9068 Py_DECREF(substring);
9069 if (result) {
9070 Py_RETURN_TRUE;
9071 }
9072 }
9073 /* nothing matched */
9074 Py_RETURN_FALSE;
9075 }
9076 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009077 if (substring == NULL) {
9078 if (PyErr_ExceptionMatches(PyExc_TypeError))
9079 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9080 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009082 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009083 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009085 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086}
9087
9088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009089PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009092Return True if S ends with the specified suffix, False otherwise.\n\
9093With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009094With optional end, stop comparing S at that position.\n\
9095suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096
9097static PyObject *
9098unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009099 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009101 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009103 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009104 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009105 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106
Jesus Ceaac451502011-04-20 17:09:23 +02009107 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009108 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009109 if (PyTuple_Check(subobj)) {
9110 Py_ssize_t i;
9111 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9112 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009114 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009116 result = tailmatch(self, substring, start, end, +1);
9117 Py_DECREF(substring);
9118 if (result) {
9119 Py_RETURN_TRUE;
9120 }
9121 }
9122 Py_RETURN_FALSE;
9123 }
9124 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009125 if (substring == NULL) {
9126 if (PyErr_ExceptionMatches(PyExc_TypeError))
9127 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9128 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009130 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009131 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009133 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134}
9135
Eric Smith8c663262007-08-25 02:26:07 +00009136#include "stringlib/string_format.h"
9137
9138PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009140\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009141Return a formatted version of S, using substitutions from args and kwargs.\n\
9142The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009143
Eric Smith27bbca62010-11-04 17:06:58 +00009144PyDoc_STRVAR(format_map__doc__,
9145 "S.format_map(mapping) -> str\n\
9146\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009147Return a formatted version of S, using substitutions from mapping.\n\
9148The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009149
Eric Smith4a7d76d2008-05-30 18:10:19 +00009150static PyObject *
9151unicode__format__(PyObject* self, PyObject* args)
9152{
9153 PyObject *format_spec;
9154
9155 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9156 return NULL;
9157
9158 return _PyUnicode_FormatAdvanced(self,
9159 PyUnicode_AS_UNICODE(format_spec),
9160 PyUnicode_GET_SIZE(format_spec));
9161}
9162
Eric Smith8c663262007-08-25 02:26:07 +00009163PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009165\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009166Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009167
9168static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009169unicode__sizeof__(PyUnicodeObject *v)
9170{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009171 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9172 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009173}
9174
9175PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009176 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009177
9178static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009179unicode_getnewargs(PyUnicodeObject *v)
9180{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009181 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009182}
9183
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184static PyMethodDef unicode_methods[] = {
9185
9186 /* Order is according to common usage: often used methods should
9187 appear first, since lookup is done sequentially. */
9188
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009189 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009190 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9191 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009192 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009193 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9194 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9195 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9196 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9197 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9198 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9199 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009200 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009201 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9202 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9203 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009204 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009205 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9206 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9207 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009208 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009209 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009210 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009211 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009212 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9213 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9214 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9215 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9216 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9217 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9218 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9219 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9220 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9221 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9222 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9223 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9224 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9225 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009226 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009227 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009228 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009229 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009230 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009231 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009232 {"maketrans", (PyCFunction) unicode_maketrans,
9233 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009234 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009235#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009236 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237#endif
9238
9239#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009240 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009241 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009242 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009243#endif
9244
Benjamin Peterson14339b62009-01-31 16:36:08 +00009245 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246 {NULL, NULL}
9247};
9248
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009249static PyObject *
9250unicode_mod(PyObject *v, PyObject *w)
9251{
Benjamin Peterson29060642009-01-31 22:14:21 +00009252 if (!PyUnicode_Check(v)) {
9253 Py_INCREF(Py_NotImplemented);
9254 return Py_NotImplemented;
9255 }
9256 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009257}
9258
9259static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009260 0, /*nb_add*/
9261 0, /*nb_subtract*/
9262 0, /*nb_multiply*/
9263 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009264};
9265
Guido van Rossumd57fd912000-03-10 22:53:23 +00009266static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009267 (lenfunc) unicode_length, /* sq_length */
9268 PyUnicode_Concat, /* sq_concat */
9269 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9270 (ssizeargfunc) unicode_getitem, /* sq_item */
9271 0, /* sq_slice */
9272 0, /* sq_ass_item */
9273 0, /* sq_ass_slice */
9274 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275};
9276
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009277static PyObject*
9278unicode_subscript(PyUnicodeObject* self, PyObject* item)
9279{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009280 if (PyIndex_Check(item)) {
9281 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009282 if (i == -1 && PyErr_Occurred())
9283 return NULL;
9284 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009285 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009286 return unicode_getitem(self, i);
9287 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009288 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009289 Py_UNICODE* source_buf;
9290 Py_UNICODE* result_buf;
9291 PyObject* result;
9292
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009293 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009294 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009295 return NULL;
9296 }
9297
9298 if (slicelength <= 0) {
9299 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009300 } else if (start == 0 && step == 1 && slicelength == self->length &&
9301 PyUnicode_CheckExact(self)) {
9302 Py_INCREF(self);
9303 return (PyObject *)self;
9304 } else if (step == 1) {
9305 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009306 } else {
9307 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009308 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9309 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009310
Benjamin Peterson29060642009-01-31 22:14:21 +00009311 if (result_buf == NULL)
9312 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009313
9314 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9315 result_buf[i] = source_buf[cur];
9316 }
Tim Petersced69f82003-09-16 20:30:58 +00009317
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009318 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009319 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009320 return result;
9321 }
9322 } else {
9323 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9324 return NULL;
9325 }
9326}
9327
9328static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009329 (lenfunc)unicode_length, /* mp_length */
9330 (binaryfunc)unicode_subscript, /* mp_subscript */
9331 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009332};
9333
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335/* Helpers for PyUnicode_Format() */
9336
9337static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009338getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009340 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 (*p_argidx)++;
9343 if (arglen < 0)
9344 return args;
9345 else
9346 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347 }
9348 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009349 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350 return NULL;
9351}
9352
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009353/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009355static PyObject *
9356formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009358 char *p;
9359 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009361
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 x = PyFloat_AsDouble(v);
9363 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009364 return NULL;
9365
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009367 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009368
Eric Smith0923d1d2009-04-16 20:16:10 +00009369 p = PyOS_double_to_string(x, type, prec,
9370 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009371 if (p == NULL)
9372 return NULL;
9373 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009374 PyMem_Free(p);
9375 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376}
9377
Tim Peters38fd5b62000-09-21 05:43:11 +00009378static PyObject*
9379formatlong(PyObject *val, int flags, int prec, int type)
9380{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009381 char *buf;
9382 int len;
9383 PyObject *str; /* temporary string object. */
9384 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009385
Benjamin Peterson14339b62009-01-31 16:36:08 +00009386 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9387 if (!str)
9388 return NULL;
9389 result = PyUnicode_FromStringAndSize(buf, len);
9390 Py_DECREF(str);
9391 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009392}
9393
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394static int
9395formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009396 size_t buflen,
9397 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009399 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009400 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009401 if (PyUnicode_GET_SIZE(v) == 1) {
9402 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9403 buf[1] = '\0';
9404 return 1;
9405 }
9406#ifndef Py_UNICODE_WIDE
9407 if (PyUnicode_GET_SIZE(v) == 2) {
9408 /* Decode a valid surrogate pair */
9409 int c0 = PyUnicode_AS_UNICODE(v)[0];
9410 int c1 = PyUnicode_AS_UNICODE(v)[1];
9411 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9412 0xDC00 <= c1 && c1 <= 0xDFFF) {
9413 buf[0] = c0;
9414 buf[1] = c1;
9415 buf[2] = '\0';
9416 return 2;
9417 }
9418 }
9419#endif
9420 goto onError;
9421 }
9422 else {
9423 /* Integer input truncated to a character */
9424 long x;
9425 x = PyLong_AsLong(v);
9426 if (x == -1 && PyErr_Occurred())
9427 goto onError;
9428
9429 if (x < 0 || x > 0x10ffff) {
9430 PyErr_SetString(PyExc_OverflowError,
9431 "%c arg not in range(0x110000)");
9432 return -1;
9433 }
9434
9435#ifndef Py_UNICODE_WIDE
9436 if (x > 0xffff) {
9437 x -= 0x10000;
9438 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9439 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9440 return 2;
9441 }
9442#endif
9443 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009444 buf[1] = '\0';
9445 return 1;
9446 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009447
Benjamin Peterson29060642009-01-31 22:14:21 +00009448 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009449 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009450 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009451 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452}
9453
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009454/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009455 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009456*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009457#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009458
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009460 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461{
9462 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009463 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009464 int args_owned = 0;
9465 PyUnicodeObject *result = NULL;
9466 PyObject *dict = NULL;
9467 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009468
Guido van Rossumd57fd912000-03-10 22:53:23 +00009469 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009470 PyErr_BadInternalCall();
9471 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009472 }
9473 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009474 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 fmt = PyUnicode_AS_UNICODE(uformat);
9477 fmtcnt = PyUnicode_GET_SIZE(uformat);
9478
9479 reslen = rescnt = fmtcnt + 100;
9480 result = _PyUnicode_New(reslen);
9481 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483 res = PyUnicode_AS_UNICODE(result);
9484
9485 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009486 arglen = PyTuple_Size(args);
9487 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 }
9489 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009490 arglen = -1;
9491 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009493 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009494 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496
9497 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 if (*fmt != '%') {
9499 if (--rescnt < 0) {
9500 rescnt = fmtcnt + 100;
9501 reslen += rescnt;
9502 if (_PyUnicode_Resize(&result, reslen) < 0)
9503 goto onError;
9504 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9505 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009506 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009507 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009508 }
9509 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 /* Got a format specifier */
9511 int flags = 0;
9512 Py_ssize_t width = -1;
9513 int prec = -1;
9514 Py_UNICODE c = '\0';
9515 Py_UNICODE fill;
9516 int isnumok;
9517 PyObject *v = NULL;
9518 PyObject *temp = NULL;
9519 Py_UNICODE *pbuf;
9520 Py_UNICODE sign;
9521 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009522 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523
Benjamin Peterson29060642009-01-31 22:14:21 +00009524 fmt++;
9525 if (*fmt == '(') {
9526 Py_UNICODE *keystart;
9527 Py_ssize_t keylen;
9528 PyObject *key;
9529 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009530
Benjamin Peterson29060642009-01-31 22:14:21 +00009531 if (dict == NULL) {
9532 PyErr_SetString(PyExc_TypeError,
9533 "format requires a mapping");
9534 goto onError;
9535 }
9536 ++fmt;
9537 --fmtcnt;
9538 keystart = fmt;
9539 /* Skip over balanced parentheses */
9540 while (pcount > 0 && --fmtcnt >= 0) {
9541 if (*fmt == ')')
9542 --pcount;
9543 else if (*fmt == '(')
9544 ++pcount;
9545 fmt++;
9546 }
9547 keylen = fmt - keystart - 1;
9548 if (fmtcnt < 0 || pcount > 0) {
9549 PyErr_SetString(PyExc_ValueError,
9550 "incomplete format key");
9551 goto onError;
9552 }
9553#if 0
9554 /* keys are converted to strings using UTF-8 and
9555 then looked up since Python uses strings to hold
9556 variables names etc. in its namespaces and we
9557 wouldn't want to break common idioms. */
9558 key = PyUnicode_EncodeUTF8(keystart,
9559 keylen,
9560 NULL);
9561#else
9562 key = PyUnicode_FromUnicode(keystart, keylen);
9563#endif
9564 if (key == NULL)
9565 goto onError;
9566 if (args_owned) {
9567 Py_DECREF(args);
9568 args_owned = 0;
9569 }
9570 args = PyObject_GetItem(dict, key);
9571 Py_DECREF(key);
9572 if (args == NULL) {
9573 goto onError;
9574 }
9575 args_owned = 1;
9576 arglen = -1;
9577 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009578 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 while (--fmtcnt >= 0) {
9580 switch (c = *fmt++) {
9581 case '-': flags |= F_LJUST; continue;
9582 case '+': flags |= F_SIGN; continue;
9583 case ' ': flags |= F_BLANK; continue;
9584 case '#': flags |= F_ALT; continue;
9585 case '0': flags |= F_ZERO; continue;
9586 }
9587 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009588 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009589 if (c == '*') {
9590 v = getnextarg(args, arglen, &argidx);
9591 if (v == NULL)
9592 goto onError;
9593 if (!PyLong_Check(v)) {
9594 PyErr_SetString(PyExc_TypeError,
9595 "* wants int");
9596 goto onError;
9597 }
9598 width = PyLong_AsLong(v);
9599 if (width == -1 && PyErr_Occurred())
9600 goto onError;
9601 if (width < 0) {
9602 flags |= F_LJUST;
9603 width = -width;
9604 }
9605 if (--fmtcnt >= 0)
9606 c = *fmt++;
9607 }
9608 else if (c >= '0' && c <= '9') {
9609 width = c - '0';
9610 while (--fmtcnt >= 0) {
9611 c = *fmt++;
9612 if (c < '0' || c > '9')
9613 break;
9614 if ((width*10) / 10 != width) {
9615 PyErr_SetString(PyExc_ValueError,
9616 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009617 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009618 }
9619 width = width*10 + (c - '0');
9620 }
9621 }
9622 if (c == '.') {
9623 prec = 0;
9624 if (--fmtcnt >= 0)
9625 c = *fmt++;
9626 if (c == '*') {
9627 v = getnextarg(args, arglen, &argidx);
9628 if (v == NULL)
9629 goto onError;
9630 if (!PyLong_Check(v)) {
9631 PyErr_SetString(PyExc_TypeError,
9632 "* wants int");
9633 goto onError;
9634 }
9635 prec = PyLong_AsLong(v);
9636 if (prec == -1 && PyErr_Occurred())
9637 goto onError;
9638 if (prec < 0)
9639 prec = 0;
9640 if (--fmtcnt >= 0)
9641 c = *fmt++;
9642 }
9643 else if (c >= '0' && c <= '9') {
9644 prec = c - '0';
9645 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009646 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009647 if (c < '0' || c > '9')
9648 break;
9649 if ((prec*10) / 10 != prec) {
9650 PyErr_SetString(PyExc_ValueError,
9651 "prec too big");
9652 goto onError;
9653 }
9654 prec = prec*10 + (c - '0');
9655 }
9656 }
9657 } /* prec */
9658 if (fmtcnt >= 0) {
9659 if (c == 'h' || c == 'l' || c == 'L') {
9660 if (--fmtcnt >= 0)
9661 c = *fmt++;
9662 }
9663 }
9664 if (fmtcnt < 0) {
9665 PyErr_SetString(PyExc_ValueError,
9666 "incomplete format");
9667 goto onError;
9668 }
9669 if (c != '%') {
9670 v = getnextarg(args, arglen, &argidx);
9671 if (v == NULL)
9672 goto onError;
9673 }
9674 sign = 0;
9675 fill = ' ';
9676 switch (c) {
9677
9678 case '%':
9679 pbuf = formatbuf;
9680 /* presume that buffer length is at least 1 */
9681 pbuf[0] = '%';
9682 len = 1;
9683 break;
9684
9685 case 's':
9686 case 'r':
9687 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009688 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009689 temp = v;
9690 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009691 }
9692 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009693 if (c == 's')
9694 temp = PyObject_Str(v);
9695 else if (c == 'r')
9696 temp = PyObject_Repr(v);
9697 else
9698 temp = PyObject_ASCII(v);
9699 if (temp == NULL)
9700 goto onError;
9701 if (PyUnicode_Check(temp))
9702 /* nothing to do */;
9703 else {
9704 Py_DECREF(temp);
9705 PyErr_SetString(PyExc_TypeError,
9706 "%s argument has non-string str()");
9707 goto onError;
9708 }
9709 }
9710 pbuf = PyUnicode_AS_UNICODE(temp);
9711 len = PyUnicode_GET_SIZE(temp);
9712 if (prec >= 0 && len > prec)
9713 len = prec;
9714 break;
9715
9716 case 'i':
9717 case 'd':
9718 case 'u':
9719 case 'o':
9720 case 'x':
9721 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +00009722 isnumok = 0;
9723 if (PyNumber_Check(v)) {
9724 PyObject *iobj=NULL;
9725
9726 if (PyLong_Check(v)) {
9727 iobj = v;
9728 Py_INCREF(iobj);
9729 }
9730 else {
9731 iobj = PyNumber_Long(v);
9732 }
9733 if (iobj!=NULL) {
9734 if (PyLong_Check(iobj)) {
9735 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -07009736 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +00009737 Py_DECREF(iobj);
9738 if (!temp)
9739 goto onError;
9740 pbuf = PyUnicode_AS_UNICODE(temp);
9741 len = PyUnicode_GET_SIZE(temp);
9742 sign = 1;
9743 }
9744 else {
9745 Py_DECREF(iobj);
9746 }
9747 }
9748 }
9749 if (!isnumok) {
9750 PyErr_Format(PyExc_TypeError,
9751 "%%%c format: a number is required, "
9752 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9753 goto onError;
9754 }
9755 if (flags & F_ZERO)
9756 fill = '0';
9757 break;
9758
9759 case 'e':
9760 case 'E':
9761 case 'f':
9762 case 'F':
9763 case 'g':
9764 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009765 temp = formatfloat(v, flags, prec, c);
9766 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009767 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009768 pbuf = PyUnicode_AS_UNICODE(temp);
9769 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009770 sign = 1;
9771 if (flags & F_ZERO)
9772 fill = '0';
9773 break;
9774
9775 case 'c':
9776 pbuf = formatbuf;
9777 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9778 if (len < 0)
9779 goto onError;
9780 break;
9781
9782 default:
9783 PyErr_Format(PyExc_ValueError,
9784 "unsupported format character '%c' (0x%x) "
9785 "at index %zd",
9786 (31<=c && c<=126) ? (char)c : '?',
9787 (int)c,
9788 (Py_ssize_t)(fmt - 1 -
9789 PyUnicode_AS_UNICODE(uformat)));
9790 goto onError;
9791 }
9792 if (sign) {
9793 if (*pbuf == '-' || *pbuf == '+') {
9794 sign = *pbuf++;
9795 len--;
9796 }
9797 else if (flags & F_SIGN)
9798 sign = '+';
9799 else if (flags & F_BLANK)
9800 sign = ' ';
9801 else
9802 sign = 0;
9803 }
9804 if (width < len)
9805 width = len;
9806 if (rescnt - (sign != 0) < width) {
9807 reslen -= rescnt;
9808 rescnt = width + fmtcnt + 100;
9809 reslen += rescnt;
9810 if (reslen < 0) {
9811 Py_XDECREF(temp);
9812 PyErr_NoMemory();
9813 goto onError;
9814 }
9815 if (_PyUnicode_Resize(&result, reslen) < 0) {
9816 Py_XDECREF(temp);
9817 goto onError;
9818 }
9819 res = PyUnicode_AS_UNICODE(result)
9820 + reslen - rescnt;
9821 }
9822 if (sign) {
9823 if (fill != ' ')
9824 *res++ = sign;
9825 rescnt--;
9826 if (width > len)
9827 width--;
9828 }
9829 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9830 assert(pbuf[0] == '0');
9831 assert(pbuf[1] == c);
9832 if (fill != ' ') {
9833 *res++ = *pbuf++;
9834 *res++ = *pbuf++;
9835 }
9836 rescnt -= 2;
9837 width -= 2;
9838 if (width < 0)
9839 width = 0;
9840 len -= 2;
9841 }
9842 if (width > len && !(flags & F_LJUST)) {
9843 do {
9844 --rescnt;
9845 *res++ = fill;
9846 } while (--width > len);
9847 }
9848 if (fill == ' ') {
9849 if (sign)
9850 *res++ = sign;
9851 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9852 assert(pbuf[0] == '0');
9853 assert(pbuf[1] == c);
9854 *res++ = *pbuf++;
9855 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009856 }
9857 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009858 Py_UNICODE_COPY(res, pbuf, len);
9859 res += len;
9860 rescnt -= len;
9861 while (--width >= len) {
9862 --rescnt;
9863 *res++ = ' ';
9864 }
9865 if (dict && (argidx < arglen) && c != '%') {
9866 PyErr_SetString(PyExc_TypeError,
9867 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009868 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009869 goto onError;
9870 }
9871 Py_XDECREF(temp);
9872 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873 } /* until end */
9874 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009875 PyErr_SetString(PyExc_TypeError,
9876 "not all arguments converted during string formatting");
9877 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009878 }
9879
Thomas Woutersa96affe2006-03-12 00:29:36 +00009880 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009881 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009882 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009883 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009884 }
9885 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009886 return (PyObject *)result;
9887
Benjamin Peterson29060642009-01-31 22:14:21 +00009888 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889 Py_XDECREF(result);
9890 Py_DECREF(uformat);
9891 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009892 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893 }
9894 return NULL;
9895}
9896
Jeremy Hylton938ace62002-07-17 16:30:39 +00009897static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009898unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9899
Tim Peters6d6c1a32001-08-02 04:15:00 +00009900static PyObject *
9901unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9902{
Benjamin Peterson29060642009-01-31 22:14:21 +00009903 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009904 static char *kwlist[] = {"object", "encoding", "errors", 0};
9905 char *encoding = NULL;
9906 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009907
Benjamin Peterson14339b62009-01-31 16:36:08 +00009908 if (type != &PyUnicode_Type)
9909 return unicode_subtype_new(type, args, kwds);
9910 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009911 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009912 return NULL;
9913 if (x == NULL)
9914 return (PyObject *)_PyUnicode_New(0);
9915 if (encoding == NULL && errors == NULL)
9916 return PyObject_Str(x);
9917 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009918 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009919}
9920
Guido van Rossume023fe02001-08-30 03:12:59 +00009921static PyObject *
9922unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9923{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009924 PyUnicodeObject *tmp, *pnew;
9925 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009926
Benjamin Peterson14339b62009-01-31 16:36:08 +00009927 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9928 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9929 if (tmp == NULL)
9930 return NULL;
9931 assert(PyUnicode_Check(tmp));
9932 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9933 if (pnew == NULL) {
9934 Py_DECREF(tmp);
9935 return NULL;
9936 }
9937 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9938 if (pnew->str == NULL) {
9939 _Py_ForgetReference((PyObject *)pnew);
9940 PyObject_Del(pnew);
9941 Py_DECREF(tmp);
9942 return PyErr_NoMemory();
9943 }
9944 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9945 pnew->length = n;
9946 pnew->hash = tmp->hash;
9947 Py_DECREF(tmp);
9948 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009949}
9950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009951PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009952 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009953\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009954Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009955encoding defaults to the current default string encoding.\n\
9956errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009957
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009958static PyObject *unicode_iter(PyObject *seq);
9959
Guido van Rossumd57fd912000-03-10 22:53:23 +00009960PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009961 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009962 "str", /* tp_name */
9963 sizeof(PyUnicodeObject), /* tp_size */
9964 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009965 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009966 (destructor)unicode_dealloc, /* tp_dealloc */
9967 0, /* tp_print */
9968 0, /* tp_getattr */
9969 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009970 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009971 unicode_repr, /* tp_repr */
9972 &unicode_as_number, /* tp_as_number */
9973 &unicode_as_sequence, /* tp_as_sequence */
9974 &unicode_as_mapping, /* tp_as_mapping */
9975 (hashfunc) unicode_hash, /* tp_hash*/
9976 0, /* tp_call*/
9977 (reprfunc) unicode_str, /* tp_str */
9978 PyObject_GenericGetAttr, /* tp_getattro */
9979 0, /* tp_setattro */
9980 0, /* tp_as_buffer */
9981 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009982 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009983 unicode_doc, /* tp_doc */
9984 0, /* tp_traverse */
9985 0, /* tp_clear */
9986 PyUnicode_RichCompare, /* tp_richcompare */
9987 0, /* tp_weaklistoffset */
9988 unicode_iter, /* tp_iter */
9989 0, /* tp_iternext */
9990 unicode_methods, /* tp_methods */
9991 0, /* tp_members */
9992 0, /* tp_getset */
9993 &PyBaseObject_Type, /* tp_base */
9994 0, /* tp_dict */
9995 0, /* tp_descr_get */
9996 0, /* tp_descr_set */
9997 0, /* tp_dictoffset */
9998 0, /* tp_init */
9999 0, /* tp_alloc */
10000 unicode_new, /* tp_new */
10001 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002};
10003
10004/* Initialize the Unicode implementation */
10005
Thomas Wouters78890102000-07-22 19:25:51 +000010006void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010008 int i;
10009
Thomas Wouters477c8d52006-05-27 19:21:47 +000010010 /* XXX - move this array to unicodectype.c ? */
10011 Py_UNICODE linebreak[] = {
10012 0x000A, /* LINE FEED */
10013 0x000D, /* CARRIAGE RETURN */
10014 0x001C, /* FILE SEPARATOR */
10015 0x001D, /* GROUP SEPARATOR */
10016 0x001E, /* RECORD SEPARATOR */
10017 0x0085, /* NEXT LINE */
10018 0x2028, /* LINE SEPARATOR */
10019 0x2029, /* PARAGRAPH SEPARATOR */
10020 };
10021
Fred Drakee4315f52000-05-09 19:53:39 +000010022 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010023 free_list = NULL;
10024 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010026 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010027 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010028
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010029 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010031 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010032 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010033
10034 /* initialize the linebreak bloom filter */
10035 bloom_linebreak = make_bloom_mask(
10036 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10037 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010038
10039 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040}
10041
10042/* Finalize the Unicode implementation */
10043
Christian Heimesa156e092008-02-16 07:38:31 +000010044int
10045PyUnicode_ClearFreeList(void)
10046{
10047 int freelist_size = numfree;
10048 PyUnicodeObject *u;
10049
10050 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010051 PyUnicodeObject *v = u;
10052 u = *(PyUnicodeObject **)u;
10053 if (v->str)
10054 PyObject_DEL(v->str);
10055 Py_XDECREF(v->defenc);
10056 PyObject_Del(v);
10057 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010058 }
10059 free_list = NULL;
10060 assert(numfree == 0);
10061 return freelist_size;
10062}
10063
Guido van Rossumd57fd912000-03-10 22:53:23 +000010064void
Thomas Wouters78890102000-07-22 19:25:51 +000010065_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010067 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010069 Py_XDECREF(unicode_empty);
10070 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010071
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010072 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010073 if (unicode_latin1[i]) {
10074 Py_DECREF(unicode_latin1[i]);
10075 unicode_latin1[i] = NULL;
10076 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010077 }
Christian Heimesa156e092008-02-16 07:38:31 +000010078 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010080
Walter Dörwald16807132007-05-25 13:52:07 +000010081void
10082PyUnicode_InternInPlace(PyObject **p)
10083{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010084 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10085 PyObject *t;
10086 if (s == NULL || !PyUnicode_Check(s))
10087 Py_FatalError(
10088 "PyUnicode_InternInPlace: unicode strings only please!");
10089 /* If it's a subclass, we don't really know what putting
10090 it in the interned dict might do. */
10091 if (!PyUnicode_CheckExact(s))
10092 return;
10093 if (PyUnicode_CHECK_INTERNED(s))
10094 return;
10095 if (interned == NULL) {
10096 interned = PyDict_New();
10097 if (interned == NULL) {
10098 PyErr_Clear(); /* Don't leave an exception */
10099 return;
10100 }
10101 }
10102 /* It might be that the GetItem call fails even
10103 though the key is present in the dictionary,
10104 namely when this happens during a stack overflow. */
10105 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010106 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010107 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010108
Benjamin Peterson29060642009-01-31 22:14:21 +000010109 if (t) {
10110 Py_INCREF(t);
10111 Py_DECREF(*p);
10112 *p = t;
10113 return;
10114 }
Walter Dörwald16807132007-05-25 13:52:07 +000010115
Benjamin Peterson14339b62009-01-31 16:36:08 +000010116 PyThreadState_GET()->recursion_critical = 1;
10117 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10118 PyErr_Clear();
10119 PyThreadState_GET()->recursion_critical = 0;
10120 return;
10121 }
10122 PyThreadState_GET()->recursion_critical = 0;
10123 /* The two references in interned are not counted by refcnt.
10124 The deallocator will take care of this */
10125 Py_REFCNT(s) -= 2;
10126 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010127}
10128
10129void
10130PyUnicode_InternImmortal(PyObject **p)
10131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010132 PyUnicode_InternInPlace(p);
10133 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10134 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10135 Py_INCREF(*p);
10136 }
Walter Dörwald16807132007-05-25 13:52:07 +000010137}
10138
10139PyObject *
10140PyUnicode_InternFromString(const char *cp)
10141{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010142 PyObject *s = PyUnicode_FromString(cp);
10143 if (s == NULL)
10144 return NULL;
10145 PyUnicode_InternInPlace(&s);
10146 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010147}
10148
10149void _Py_ReleaseInternedUnicodeStrings(void)
10150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010151 PyObject *keys;
10152 PyUnicodeObject *s;
10153 Py_ssize_t i, n;
10154 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010155
Benjamin Peterson14339b62009-01-31 16:36:08 +000010156 if (interned == NULL || !PyDict_Check(interned))
10157 return;
10158 keys = PyDict_Keys(interned);
10159 if (keys == NULL || !PyList_Check(keys)) {
10160 PyErr_Clear();
10161 return;
10162 }
Walter Dörwald16807132007-05-25 13:52:07 +000010163
Benjamin Peterson14339b62009-01-31 16:36:08 +000010164 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10165 detector, interned unicode strings are not forcibly deallocated;
10166 rather, we give them their stolen references back, and then clear
10167 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010168
Benjamin Peterson14339b62009-01-31 16:36:08 +000010169 n = PyList_GET_SIZE(keys);
10170 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010171 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010172 for (i = 0; i < n; i++) {
10173 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10174 switch (s->state) {
10175 case SSTATE_NOT_INTERNED:
10176 /* XXX Shouldn't happen */
10177 break;
10178 case SSTATE_INTERNED_IMMORTAL:
10179 Py_REFCNT(s) += 1;
10180 immortal_size += s->length;
10181 break;
10182 case SSTATE_INTERNED_MORTAL:
10183 Py_REFCNT(s) += 2;
10184 mortal_size += s->length;
10185 break;
10186 default:
10187 Py_FatalError("Inconsistent interned string state.");
10188 }
10189 s->state = SSTATE_NOT_INTERNED;
10190 }
10191 fprintf(stderr, "total size of all interned strings: "
10192 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10193 "mortal/immortal\n", mortal_size, immortal_size);
10194 Py_DECREF(keys);
10195 PyDict_Clear(interned);
10196 Py_DECREF(interned);
10197 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010198}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010199
10200
/********************* Unicode Iterator **************************/
10202
10203typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010204 PyObject_HEAD
10205 Py_ssize_t it_index;
10206 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010207} unicodeiterobject;
10208
10209static void
10210unicodeiter_dealloc(unicodeiterobject *it)
10211{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010212 _PyObject_GC_UNTRACK(it);
10213 Py_XDECREF(it->it_seq);
10214 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010215}
10216
10217static int
10218unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10219{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010220 Py_VISIT(it->it_seq);
10221 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010222}
10223
10224static PyObject *
10225unicodeiter_next(unicodeiterobject *it)
10226{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010227 PyUnicodeObject *seq;
10228 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010229
Benjamin Peterson14339b62009-01-31 16:36:08 +000010230 assert(it != NULL);
10231 seq = it->it_seq;
10232 if (seq == NULL)
10233 return NULL;
10234 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010235
Benjamin Peterson14339b62009-01-31 16:36:08 +000010236 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10237 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010238 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010239 if (item != NULL)
10240 ++it->it_index;
10241 return item;
10242 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010243
Benjamin Peterson14339b62009-01-31 16:36:08 +000010244 Py_DECREF(seq);
10245 it->it_seq = NULL;
10246 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010247}
10248
10249static PyObject *
10250unicodeiter_len(unicodeiterobject *it)
10251{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010252 Py_ssize_t len = 0;
10253 if (it->it_seq)
10254 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10255 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010256}
10257
10258PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10259
10260static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010261 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010262 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010263 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010264};
10265
10266PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010267 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10268 "str_iterator", /* tp_name */
10269 sizeof(unicodeiterobject), /* tp_basicsize */
10270 0, /* tp_itemsize */
10271 /* methods */
10272 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10273 0, /* tp_print */
10274 0, /* tp_getattr */
10275 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010276 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010277 0, /* tp_repr */
10278 0, /* tp_as_number */
10279 0, /* tp_as_sequence */
10280 0, /* tp_as_mapping */
10281 0, /* tp_hash */
10282 0, /* tp_call */
10283 0, /* tp_str */
10284 PyObject_GenericGetAttr, /* tp_getattro */
10285 0, /* tp_setattro */
10286 0, /* tp_as_buffer */
10287 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10288 0, /* tp_doc */
10289 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10290 0, /* tp_clear */
10291 0, /* tp_richcompare */
10292 0, /* tp_weaklistoffset */
10293 PyObject_SelfIter, /* tp_iter */
10294 (iternextfunc)unicodeiter_next, /* tp_iternext */
10295 unicodeiter_methods, /* tp_methods */
10296 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010297};
10298
10299static PyObject *
10300unicode_iter(PyObject *seq)
10301{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010302 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010303
Benjamin Peterson14339b62009-01-31 16:36:08 +000010304 if (!PyUnicode_Check(seq)) {
10305 PyErr_BadInternalCall();
10306 return NULL;
10307 }
10308 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10309 if (it == NULL)
10310 return NULL;
10311 it->it_index = 0;
10312 Py_INCREF(seq);
10313 it->it_seq = (PyUnicodeObject *)seq;
10314 _PyObject_GC_TRACK(it);
10315 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010316}
10317
/* Return the number of Py_UNICODE elements in the null-terminated
   string u, not counting the terminating null element.

   The count is kept in a size_t, matching the return type, so that very
   long strings cannot overflow a narrower signed counter. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;

    while (*u++)
        res++;
    return res;
}
10326
/* Copy the null-terminated string s2, including its terminator, into the
   buffer s1 and return s1.  The caller must provide a destination large
   enough for the whole string; no bounds checking is done. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    Py_UNICODE ch;

    do {
        ch = *s2++;
        *dst++ = ch;
    } while (ch != 0);
    return s1;
}
10334
/* Copy at most n elements of the null-terminated string s2 into s1 and
   return s1.

   Copying stops after the terminating null element has been copied or
   after n elements have been written, whichever comes first.  As with
   strncpy(), the result is not null-terminated when s2 holds n or more
   characters.  Unlike strncpy(), the rest of the destination is not
   padded with null elements.

   The count is checked before each store, so never more than n elements
   of s1 are written (nothing at all when n is 0). */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    while (n-- != 0) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
10344
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010345Py_UNICODE*
10346Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10347{
10348 Py_UNICODE *u1 = s1;
10349 u1 += Py_UNICODE_strlen(u1);
10350 Py_UNICODE_strcpy(u1, s2);
10351 return s1;
10352}
10353
/* Compare two null-terminated Py_UNICODE strings.

   Returns 0 if they are equal, -1 if s1 sorts before s2 and +1 if it
   sorts after.  A string that is a proper prefix of the other sorts
   first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    while (*s1 != 0 && *s2 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0)
        return (*s2 == 0) ? 0 : -1;     /* equal, or s1 is a prefix */
    if (*s2 == 0)
        return 1;                       /* s2 is a prefix of s1 */
    return (*s1 < *s2) ? -1 : +1;       /* first differing element */
}
10367
/* Compare at most the first n elements of two null-terminated
   Py_UNICODE strings.

   Returns -1 or +1 at the first position where the strings differ
   (according to which element is smaller), and 0 if no difference is
   found within n elements or before the shared terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE c1 = s1[i];
        const Py_UNICODE c2 = s2[i];

        if (c1 != c2)
            return (c1 < c2) ? -1 : +1;
        if (c1 == 0)
            break;              /* both strings ended: equal */
    }
    return 0;
}
10384
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if c does not occur.  The terminator itself is never
   matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE *)s;
        s++;
    }
    return NULL;
}
10394
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if c does not occur.  The terminator itself is never
   matched, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass, remembering the most recent match. */
    for (; *s != 0; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE *)last;
}
10407
Victor Stinner71133ff2010-09-01 23:43:53 +000010408Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010409PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010410{
10411 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10412 Py_UNICODE *copy;
10413 Py_ssize_t size;
10414
10415 /* Ensure we won't overflow the size. */
10416 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10417 PyErr_NoMemory();
10418 return NULL;
10419 }
10420 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10421 size *= sizeof(Py_UNICODE);
10422 copy = PyMem_Malloc(size);
10423 if (copy == NULL) {
10424 PyErr_NoMemory();
10425 return NULL;
10426 }
10427 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10428 return copy;
10429}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010430
/* The _string module exports formatter_parser and
   formatter_field_name_split so that the string.Formatter class, which is
   implemented in Python, can use them. */
10433
10434static PyMethodDef _string_methods[] = {
10435 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10436 METH_O, PyDoc_STR("split the argument as a field name")},
10437 {"formatter_parser", (PyCFunction) formatter_parser,
10438 METH_O, PyDoc_STR("parse the argument as a format string")},
10439 {NULL, NULL}
10440};
10441
10442static struct PyModuleDef _string_module = {
10443 PyModuleDef_HEAD_INIT,
10444 "_string",
10445 PyDoc_STR("string helper module"),
10446 0,
10447 _string_methods,
10448 NULL,
10449 NULL,
10450 NULL,
10451 NULL
10452};
10453
10454PyMODINIT_FUNC
10455PyInit__string(void)
10456{
10457 return PyModule_Create(&_string_module);
10458}
10459
10460
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010461#ifdef __cplusplus
10462}
10463#endif