blob: ddd8d5307616a06fb41aacac43413a497d5aba4b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Lookup table for fast detection of the most frequent whitespace
   characters.  It is indexed by code point and covers 0x00 - 0x7F; an
   entry of 1 marks a whitespace character. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
         0x09 CHARACTER TABULATION
         0x0A LINE FEED
         0x0B LINE TABULATION
         0x0C FORM FEED
         0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
         0x1C FILE SEPARATOR
         0x1D GROUP SEPARATOR
         0x1E RECORD SEPARATOR
         0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
         0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
144
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000145static PyObject *unicode_encode_call_errorhandler(const char *errors,
146 PyObject **errorHandler,const char *encoding, const char *reason,
147 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
148 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
149
Victor Stinner31be90b2010-04-22 19:38:16 +0000150static void raise_encode_exception(PyObject **exceptionObject,
151 const char *encoding,
152 const Py_UNICODE *unicode, Py_ssize_t size,
153 Py_ssize_t startpos, Py_ssize_t endpos,
154 const char *reason);
155
/* Lookup table for fast detection of ASCII line break characters.  It is
   indexed by code point and covers 0x00 - 0x7F; an entry of 1 marks a
   line break.  The table is only ever read, so it is declared const
   (matching _Py_ascii_whitespace). */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
         0x0A LINE FEED
         0x0B LINE TABULATION
         0x0C FORM FEED
         0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
         0x1C FILE SEPARATOR
         0x1D GROUP SEPARATOR
         0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
183
184
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000185Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000186PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000188#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000189 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 /* This is actually an illegal character, so it should
192 not be passed to unichr. */
193 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#endif
195}
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197/* --- Bloom Filters ----------------------------------------------------- */
198
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   bits of each unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
202
203/* the linebreak mask is set up by Unicode_Init below */
204
Antoine Pitrouf068f942010-01-13 14:19:12 +0000205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215#define BLOOM_MASK unsigned long
216
217static BLOOM_MASK bloom_linebreak;
218
/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by
   (ch & (BLOOM_WIDTH - 1)).  Both arguments are parenthesized so that
   compound expressions (e.g. `a | b`) expand with the intended
   precedence.  For BLOOM_ADD, `mask` must be a modifiable lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221
Benjamin Peterson29060642009-01-31 22:14:21 +0000222#define BLOOM_LINEBREAK(ch) \
223 ((ch) < 128U ? ascii_linebreak[(ch)] : \
224 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228 /* calculate simple bloom-style bitmask for a given unicode string */
229
Antoine Pitrouf068f942010-01-13 14:19:12 +0000230 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 Py_ssize_t i;
232
233 mask = 0;
234 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000235 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236
237 return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242 Py_ssize_t i;
243
244 for (i = 0; i < setlen; i++)
245 if (set[i] == chr)
246 return 1;
247
248 return 0;
249}
250
/* True if `chr` is a member of the character set `set` (of length
   `setlen`).  The bloom `mask` is consulted first as a cheap pre-filter;
   unicode_member() is only called when the mask bit is set.  The whole
   expansion is parenthesized so the macro can safely be negated or
   combined with other operators. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
253
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254/* --- Unicode Object ----------------------------------------------------- */
255
256static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259{
260 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000261
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000266 /* Resizing shared object (unicode_empty or single character
267 objects) in-place is not allowed. Use PyUnicode_Resize()
268 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 (unicode->length == 1 &&
272 unicode->str[0] < 256U &&
273 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000275 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 return -1;
277 }
278
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279 /* We allocate one more byte to make sure the string is Ux0000 terminated.
280 The overallocation is also used by fastsearch, which assumes that it's
281 safe to look at str[length] (without making any assumptions about what
282 it contains). */
283
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000285 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000286 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 PyErr_NoMemory();
290 return -1;
291 }
292 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294
Benjamin Peterson29060642009-01-31 22:14:21 +0000295 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000298 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308
309 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000310 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000417 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 }
419 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000420 *(PyUnicodeObject **)unicode = free_list;
421 free_list = unicode;
422 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000425 PyObject_DEL(unicode->str);
426 Py_XDECREF(unicode->defenc);
427 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 }
429}
430
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433{
434 register PyUnicodeObject *v;
435
436 /* Argument checks */
437 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000441 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 PyErr_BadInternalCall();
444 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 }
446
447 /* Resizing unicode_empty and single character objects is not
448 possible since these are being shared. We simply return a fresh
449 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000450 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 (v == unicode_empty || v->length == 1)) {
452 PyUnicodeObject *w = _PyUnicode_New(length);
453 if (w == NULL)
454 return -1;
455 Py_UNICODE_COPY(w->str, v->str,
456 length < v->length ? length : v->length);
457 Py_DECREF(*unicode);
458 *unicode = w;
459 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
461
462 /* Note that we don't have to modify *unicode for unshared Unicode
463 objects, since we can modify them in-place. */
464 return unicode_resize(v, length);
465}
466
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474{
475 PyUnicodeObject *unicode;
476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 /* If the Unicode data is known at construction time, we can apply
478 some optimizations which share commonly used objects. */
479 if (u != NULL) {
480
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000486
487 /* Single character Unicode objects in the Latin-1 range are
488 shared when using this constructor */
489 if (size == 1 && *u < 256) {
490 unicode = unicode_latin1[*u];
491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = *u;
496 unicode_latin1[*u] = unicode;
497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000501 }
Tim Petersced69f82003-09-16 20:30:58 +0000502
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 /* Copy the Unicode data into the new object */
508 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000509 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 return (PyObject *)unicode;
512}
513
Walter Dörwaldd2034312007-05-18 16:29:38 +0000514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000515{
516 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000517
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 return NULL;
522 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000525 some optimizations which share commonly used objects.
526 Also, this means the input must be UTF-8, so fall back to the
527 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (u != NULL) {
529
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 /* Optimization for empty strings */
531 if (size == 0 && unicode_empty != NULL) {
532 Py_INCREF(unicode_empty);
533 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000534 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000535
536 /* Single characters are shared when using this constructor.
537 Restrict to ASCII, since the input must be UTF-8. */
538 if (size == 1 && Py_CHARMASK(*u) < 128) {
539 unicode = unicode_latin1[Py_CHARMASK(*u)];
540 if (!unicode) {
541 unicode = _PyUnicode_New(1);
542 if (!unicode)
543 return NULL;
544 unicode->str[0] = Py_CHARMASK(*u);
545 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 }
547 Py_INCREF(unicode);
548 return (PyObject *)unicode;
549 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000550
551 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000552 }
553
Walter Dörwald55507312007-05-18 13:12:10 +0000554 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 if (!unicode)
556 return NULL;
557
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 return (PyObject *)unicode;
559}
560
Walter Dörwaldd2034312007-05-18 16:29:38 +0000561PyObject *PyUnicode_FromString(const char *u)
562{
563 size_t size = strlen(u);
564 if (size > PY_SSIZE_T_MAX) {
565 PyErr_SetString(PyExc_OverflowError, "input too long");
566 return NULL;
567 }
568
569 return PyUnicode_FromStringAndSize(u, size);
570}
571
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572#ifdef HAVE_WCHAR_H
573
Mark Dickinson081dfee2009-03-18 14:47:41 +0000574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581 to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584 Py_ssize_t size)
585{
586 PyUnicodeObject *unicode;
587 register Py_ssize_t i;
588 Py_ssize_t alloc;
589 const wchar_t *orig_w;
590
591 if (w == NULL) {
592 if (size == 0)
593 return PyUnicode_FromStringAndSize(NULL, 0);
594 PyErr_BadInternalCall();
595 return NULL;
596 }
597
598 if (size == -1) {
599 size = wcslen(w);
600 }
601
602 alloc = size;
603 orig_w = w;
604 for (i = size; i > 0; i--) {
605 if (*w > 0xFFFF)
606 alloc++;
607 w++;
608 }
609 w = orig_w;
610 unicode = _PyUnicode_New(alloc);
611 if (!unicode)
612 return NULL;
613
614 /* Copy the wchar_t data into the new object */
615 {
616 register Py_UNICODE *u;
617 u = PyUnicode_AS_UNICODE(unicode);
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF) {
620 wchar_t ordinal = *w++;
621 ordinal -= 0x10000;
622 *u++ = 0xD800 | (ordinal >> 10);
623 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 }
625 else
626 *u++ = *w++;
627 }
628 }
629 return (PyObject *)unicode;
630}
631
632#else
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636{
637 PyUnicodeObject *unicode;
638
639 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000640 if (size == 0)
641 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == -1) {
647 size = wcslen(w);
648 }
649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = _PyUnicode_New(size);
651 if (!unicode)
652 return NULL;
653
654 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000657#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000659 register Py_UNICODE *u;
660 register Py_ssize_t i;
661 u = PyUnicode_AS_UNICODE(unicode);
662 for (i = size; i > 0; i--)
663 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665#endif
666
667 return (PyObject *)unicode;
668}
669
Mark Dickinson081dfee2009-03-18 14:47:41 +0000670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
Walter Dörwald346737f2007-05-31 10:44:43 +0000674static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000688 else if (longlongflag) {
689 /* longlongflag should only ever be nonzero on machines with
690 HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692 char *f = PY_FORMAT_LONG_LONG;
693 while (*f)
694 *fmt++ = *f++;
695#else
696 /* we shouldn't ever get here */
697 assert(0);
698 *fmt++ = 'l';
699#endif
700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000701 else if (size_tflag) {
702 char *f = PY_FORMAT_SIZE_T;
703 while (*f)
704 *fmt++ = *f++;
705 }
706 *fmt++ = c;
707 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000708}
709
Walter Dörwaldd2034312007-05-18 16:29:38 +0000710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
711
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000712/* size of fixed-size buffer for formatting single arguments */
713#define ITEM_BUFFER_LEN 21
714/* maximum number of characters required for output of %ld. 21 characters
715 allows for 64-bit integers (in decimal) and an optional sign. */
716#define MAX_LONG_CHARS 21
717/* maximum number of characters required for output of %lld.
718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 va_list count;
726 Py_ssize_t callcount = 0;
727 PyObject **callresults = NULL;
728 PyObject **callresult = NULL;
729 Py_ssize_t n = 0;
730 int width = 0;
731 int precision = 0;
732 int zeropad;
733 const char* f;
734 Py_UNICODE *s;
735 PyObject *string;
736 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000737 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 /* use abuffer instead of buffer, if we need more space
739 * (which can happen if there's a format specifier with width). */
740 char *abuffer = NULL;
741 char *realbuffer;
742 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000746 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000747 /* step 1: count the number of %S/%R/%A/%s format specifications
748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000752 if (*f == '%') {
753 if (*(f+1)=='%')
754 continue;
Victor Stinner2b574a22011-03-01 22:48:49 +0000755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000756 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000757 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000758 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 ;
761 if (*f == 's')
762 ++callcount;
763 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000764 else if (128 <= (unsigned char)*f) {
765 PyErr_Format(PyExc_ValueError,
766 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000767 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000768 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000769 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 }
772 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 if (callcount) {
775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776 if (!callresults) {
777 PyErr_NoMemory();
778 return NULL;
779 }
780 callresult = callresults;
781 }
782 /* step 3: figure out how large a buffer we need */
783 for (f = format; *f; f++) {
784 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000785#ifdef HAVE_LONG_LONG
786 int longlongflag = 0;
787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000788 const char* p = f;
789 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000790 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796 * they don't affect the amount of space we reserve.
797 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000798 if (*f == 'l') {
799 if (f[1] == 'd' || f[1] == 'u') {
800 ++f;
801 }
802#ifdef HAVE_LONG_LONG
803 else if (f[1] == 'l' &&
804 (f[2] == 'd' || f[2] == 'u')) {
805 longlongflag = 1;
806 f += 2;
807 }
808#endif
809 }
810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000811 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000812 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
Benjamin Peterson14339b62009-01-31 16:36:08 +0000814 switch (*f) {
815 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +0000816 {
817#ifndef Py_UNICODE_WIDE
818 int ordinal = va_arg(count, int);
819 if (ordinal > 0xffff)
820 n += 2;
821 else
822 n++;
823#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000824 (void)va_arg(count, int);
Victor Stinner659eb842011-02-23 12:14:22 +0000825 n++;
826#endif
827 break;
828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 case '%':
830 n++;
831 break;
832 case 'd': case 'u': case 'i': case 'x':
833 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000834#ifdef HAVE_LONG_LONG
835 if (longlongflag) {
836 if (width < MAX_LONG_LONG_CHARS)
837 width = MAX_LONG_LONG_CHARS;
838 }
839 else
840#endif
841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842 including sign. Decimal takes the most space. This
843 isn't enough for octal. If a width is specified we
844 need more (which we allocate later). */
845 if (width < MAX_LONG_CHARS)
846 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000848 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000849 if (abuffersize < width)
850 abuffersize = width;
851 break;
852 case 's':
853 {
854 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000855 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857 if (!str)
858 goto fail;
859 n += PyUnicode_GET_SIZE(str);
860 /* Remember the str and switch to the next slot */
861 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000862 break;
863 }
864 case 'U':
865 {
866 PyObject *obj = va_arg(count, PyObject *);
867 assert(obj && PyUnicode_Check(obj));
868 n += PyUnicode_GET_SIZE(obj);
869 break;
870 }
871 case 'V':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 const char *str = va_arg(count, const char *);
Victor Stinner2b574a22011-03-01 22:48:49 +0000875 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000876 assert(obj || str);
877 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2b574a22011-03-01 22:48:49 +0000878 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000879 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2b574a22011-03-01 22:48:49 +0000880 *callresult++ = NULL;
881 }
882 else {
883 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
884 if (!str_obj)
885 goto fail;
886 n += PyUnicode_GET_SIZE(str_obj);
887 *callresult++ = str_obj;
888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000889 break;
890 }
891 case 'S':
892 {
893 PyObject *obj = va_arg(count, PyObject *);
894 PyObject *str;
895 assert(obj);
896 str = PyObject_Str(obj);
897 if (!str)
898 goto fail;
899 n += PyUnicode_GET_SIZE(str);
900 /* Remember the str and switch to the next slot */
901 *callresult++ = str;
902 break;
903 }
904 case 'R':
905 {
906 PyObject *obj = va_arg(count, PyObject *);
907 PyObject *repr;
908 assert(obj);
909 repr = PyObject_Repr(obj);
910 if (!repr)
911 goto fail;
912 n += PyUnicode_GET_SIZE(repr);
913 /* Remember the repr and switch to the next slot */
914 *callresult++ = repr;
915 break;
916 }
917 case 'A':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 PyObject *ascii;
921 assert(obj);
922 ascii = PyObject_ASCII(obj);
923 if (!ascii)
924 goto fail;
925 n += PyUnicode_GET_SIZE(ascii);
926 /* Remember the repr and switch to the next slot */
927 *callresult++ = ascii;
928 break;
929 }
930 case 'p':
931 (void) va_arg(count, int);
932 /* maximum 64-bit pointer representation:
933 * 0xffffffffffffffff
934 * so 19 characters is enough.
935 * XXX I count 18 -- what's the extra for?
936 */
937 n += 19;
938 break;
939 default:
940 /* if we stumble upon an unknown
941 formatting code, copy the rest of
942 the format string to the output
943 string. (we cannot just skip the
944 code, since there's no way to know
945 what's in the argument list) */
946 n += strlen(p);
947 goto expand;
948 }
949 } else
950 n++;
951 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000952 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000953 if (abuffersize > ITEM_BUFFER_LEN) {
954 /* add 1 for sprintf's trailing null byte */
955 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 if (!abuffer) {
957 PyErr_NoMemory();
958 goto fail;
959 }
960 realbuffer = abuffer;
961 }
962 else
963 realbuffer = buffer;
964 /* step 4: fill the buffer */
965 /* Since we've analyzed how much space we need for the worst case,
966 we don't have to resize the string.
967 There can be no errors beyond this point. */
968 string = PyUnicode_FromUnicode(NULL, n);
969 if (!string)
970 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000971
Benjamin Peterson14339b62009-01-31 16:36:08 +0000972 s = PyUnicode_AS_UNICODE(string);
973 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000974
Benjamin Peterson14339b62009-01-31 16:36:08 +0000975 for (f = format; *f; f++) {
976 if (*f == '%') {
977 const char* p = f++;
978 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000979 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000980 int size_tflag = 0;
981 zeropad = (*f == '0');
982 /* parse the width.precision part */
983 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000984 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000985 width = (width*10) + *f++ - '0';
986 precision = 0;
987 if (*f == '.') {
988 f++;
David Malcolm96960882010-11-05 17:23:41 +0000989 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000990 precision = (precision*10) + *f++ - '0';
991 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000992 /* Handle %ld, %lu, %lld and %llu. */
993 if (*f == 'l') {
994 if (f[1] == 'd' || f[1] == 'u') {
995 longflag = 1;
996 ++f;
997 }
998#ifdef HAVE_LONG_LONG
999 else if (f[1] == 'l' &&
1000 (f[2] == 'd' || f[2] == 'u')) {
1001 longlongflag = 1;
1002 f += 2;
1003 }
1004#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001005 }
1006 /* handle the size_t flag. */
1007 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1008 size_tflag = 1;
1009 ++f;
1010 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001011
Benjamin Peterson14339b62009-01-31 16:36:08 +00001012 switch (*f) {
1013 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +00001014 {
1015 int ordinal = va_arg(vargs, int);
1016#ifndef Py_UNICODE_WIDE
1017 if (ordinal > 0xffff) {
1018 ordinal -= 0x10000;
1019 *s++ = 0xD800 | (ordinal >> 10);
1020 *s++ = 0xDC00 | (ordinal & 0x3FF);
1021 } else
1022#endif
1023 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001024 break;
Victor Stinner659eb842011-02-23 12:14:22 +00001025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001026 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001027 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1028 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001029 if (longflag)
1030 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001031#ifdef HAVE_LONG_LONG
1032 else if (longlongflag)
1033 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1034#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 else if (size_tflag)
1036 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1037 else
1038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001042 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1043 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001044 if (longflag)
1045 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001046#ifdef HAVE_LONG_LONG
1047 else if (longlongflag)
1048 sprintf(realbuffer, fmt, va_arg(vargs,
1049 unsigned PY_LONG_LONG));
1050#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 else if (size_tflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1053 else
1054 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1055 appendstring(realbuffer);
1056 break;
1057 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001058 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001059 sprintf(realbuffer, fmt, va_arg(vargs, int));
1060 appendstring(realbuffer);
1061 break;
1062 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001063 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 sprintf(realbuffer, fmt, va_arg(vargs, int));
1065 appendstring(realbuffer);
1066 break;
1067 case 's':
1068 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001069 /* unused, since we already have the result */
1070 (void) va_arg(vargs, char *);
1071 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1072 PyUnicode_GET_SIZE(*callresult));
1073 s += PyUnicode_GET_SIZE(*callresult);
1074 /* We're done with the unicode()/repr() => forget it */
1075 Py_DECREF(*callresult);
1076 /* switch to next unicode()/repr() result */
1077 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 break;
1079 }
1080 case 'U':
1081 {
1082 PyObject *obj = va_arg(vargs, PyObject *);
1083 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1084 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1085 s += size;
1086 break;
1087 }
1088 case 'V':
1089 {
1090 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2b574a22011-03-01 22:48:49 +00001091 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001092 if (obj) {
1093 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1094 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1095 s += size;
1096 } else {
Victor Stinner2b574a22011-03-01 22:48:49 +00001097 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1098 PyUnicode_GET_SIZE(*callresult));
1099 s += PyUnicode_GET_SIZE(*callresult);
1100 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001101 }
Victor Stinner2b574a22011-03-01 22:48:49 +00001102 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001103 break;
1104 }
1105 case 'S':
1106 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001107 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001108 {
1109 Py_UNICODE *ucopy;
1110 Py_ssize_t usize;
1111 Py_ssize_t upos;
1112 /* unused, since we already have the result */
1113 (void) va_arg(vargs, PyObject *);
1114 ucopy = PyUnicode_AS_UNICODE(*callresult);
1115 usize = PyUnicode_GET_SIZE(*callresult);
1116 for (upos = 0; upos<usize;)
1117 *s++ = ucopy[upos++];
1118 /* We're done with the unicode()/repr() => forget it */
1119 Py_DECREF(*callresult);
1120 /* switch to next unicode()/repr() result */
1121 ++callresult;
1122 break;
1123 }
1124 case 'p':
1125 sprintf(buffer, "%p", va_arg(vargs, void*));
1126 /* %p is ill-defined: ensure leading 0x. */
1127 if (buffer[1] == 'X')
1128 buffer[1] = 'x';
1129 else if (buffer[1] != 'x') {
1130 memmove(buffer+2, buffer, strlen(buffer)+1);
1131 buffer[0] = '0';
1132 buffer[1] = 'x';
1133 }
1134 appendstring(buffer);
1135 break;
1136 case '%':
1137 *s++ = '%';
1138 break;
1139 default:
1140 appendstring(p);
1141 goto end;
1142 }
Victor Stinner1205f272010-09-11 00:54:47 +00001143 }
Victor Stinner1205f272010-09-11 00:54:47 +00001144 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001145 *s++ = *f;
1146 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147
Benjamin Peterson29060642009-01-31 22:14:21 +00001148 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 if (callresults)
1150 PyObject_Free(callresults);
1151 if (abuffer)
1152 PyObject_Free(abuffer);
1153 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1154 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001155 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001156 if (callresults) {
1157 PyObject **callresult2 = callresults;
1158 while (callresult2 < callresult) {
Victor Stinner2b574a22011-03-01 22:48:49 +00001159 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 ++callresult2;
1161 }
1162 PyObject_Free(callresults);
1163 }
1164 if (abuffer)
1165 PyObject_Free(abuffer);
1166 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001167}
1168
1169#undef appendstring
1170
1171PyObject *
1172PyUnicode_FromFormat(const char *format, ...)
1173{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 PyObject* ret;
1175 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001176
1177#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001178 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001179#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001180 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001181#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001182 ret = PyUnicode_FromFormatV(format, vargs);
1183 va_end(vargs);
1184 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001185}
1186
Victor Stinner5593d8a2010-10-02 11:11:27 +00001187/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1188 convert a Unicode object to a wide character string.
1189
Victor Stinnerd88d9832011-09-06 02:00:05 +02001190 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001191 character) required to convert the unicode object. Ignore size argument.
1192
Victor Stinnerd88d9832011-09-06 02:00:05 +02001193 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00001194 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02001195 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001196static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001197unicode_aswidechar(PyUnicodeObject *unicode,
1198 wchar_t *w,
1199 Py_ssize_t size)
1200{
1201#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001202 Py_ssize_t res;
1203 if (w != NULL) {
1204 res = PyUnicode_GET_SIZE(unicode);
1205 if (size > res)
1206 size = res + 1;
1207 else
1208 res = size;
1209 memcpy(w, unicode->str, size * sizeof(wchar_t));
1210 return res;
1211 }
1212 else
1213 return PyUnicode_GET_SIZE(unicode) + 1;
1214#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1215 register const Py_UNICODE *u;
1216 const Py_UNICODE *uend;
1217 const wchar_t *worig, *wend;
1218 Py_ssize_t nchar;
1219
Victor Stinner137c34c2010-09-29 10:25:54 +00001220 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001221 uend = u + PyUnicode_GET_SIZE(unicode);
1222 if (w != NULL) {
1223 worig = w;
1224 wend = w + size;
1225 while (u != uend && w != wend) {
1226 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1227 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1228 {
1229 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1230 u += 2;
1231 }
1232 else {
1233 *w = *u;
1234 u++;
1235 }
1236 w++;
1237 }
1238 if (w != wend)
1239 *w = L'\0';
1240 return w - worig;
1241 }
1242 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001243 nchar = 1; /* null character at the end */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001244 while (u != uend) {
1245 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1246 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1247 u += 2;
1248 else
1249 u++;
1250 nchar++;
1251 }
1252 }
1253 return nchar;
1254#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1255 register Py_UNICODE *u, *uend, ordinal;
1256 register Py_ssize_t i;
1257 wchar_t *worig, *wend;
1258 Py_ssize_t nchar;
1259
1260 u = PyUnicode_AS_UNICODE(unicode);
1261 uend = u + PyUnicode_GET_SIZE(u);
1262 if (w != NULL) {
1263 worig = w;
1264 wend = w + size;
1265 while (u != uend && w != wend) {
1266 ordinal = *u;
1267 if (ordinal > 0xffff) {
1268 ordinal -= 0x10000;
1269 *w++ = 0xD800 | (ordinal >> 10);
1270 *w++ = 0xDC00 | (ordinal & 0x3FF);
1271 }
1272 else
1273 *w++ = ordinal;
1274 u++;
1275 }
1276 if (w != wend)
1277 *w = 0;
1278 return w - worig;
1279 }
1280 else {
Victor Stinnerd88d9832011-09-06 02:00:05 +02001281 nchar = 1; /* null character */
Victor Stinner5593d8a2010-10-02 11:11:27 +00001282 while (u != uend) {
1283 if (*u > 0xffff)
1284 nchar += 2;
1285 else
1286 nchar++;
1287 u++;
1288 }
1289 return nchar;
1290 }
1291#else
1292# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001293#endif
1294}
1295
1296Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001297PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001298 wchar_t *w,
1299 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001302 PyErr_BadInternalCall();
1303 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001305 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306}
1307
Victor Stinner137c34c2010-09-29 10:25:54 +00001308wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001309PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001310 Py_ssize_t *size)
1311{
1312 wchar_t* buffer;
1313 Py_ssize_t buflen;
1314
1315 if (unicode == NULL) {
1316 PyErr_BadInternalCall();
1317 return NULL;
1318 }
1319
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001320 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001321 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001322 PyErr_NoMemory();
1323 return NULL;
1324 }
1325
Victor Stinner137c34c2010-09-29 10:25:54 +00001326 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1327 if (buffer == NULL) {
1328 PyErr_NoMemory();
1329 return NULL;
1330 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001331 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001332 if (size != NULL)
1333 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001334 return buffer;
1335}
1336
Guido van Rossumd57fd912000-03-10 22:53:23 +00001337#endif
1338
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001339PyObject *PyUnicode_FromOrdinal(int ordinal)
1340{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001341 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001342
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001343 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001344 PyErr_SetString(PyExc_ValueError,
1345 "chr() arg not in range(0x110000)");
1346 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001347 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001348
1349#ifndef Py_UNICODE_WIDE
1350 if (ordinal > 0xffff) {
1351 ordinal -= 0x10000;
1352 s[0] = 0xD800 | (ordinal >> 10);
1353 s[1] = 0xDC00 | (ordinal & 0x3FF);
1354 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001355 }
1356#endif
1357
Hye-Shik Chang40574832004-04-06 07:24:51 +00001358 s[0] = (Py_UNICODE)ordinal;
1359 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360}
1361
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362PyObject *PyUnicode_FromObject(register PyObject *obj)
1363{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001364 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001366 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001367 Py_INCREF(obj);
1368 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001369 }
1370 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 /* For a Unicode subtype that's not a Unicode object,
1372 return a true Unicode object with the same data. */
1373 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1374 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001375 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001376 PyErr_Format(PyExc_TypeError,
1377 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001378 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001379 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001380}
1381
1382PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001383 const char *encoding,
1384 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001385{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001386 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001387 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001388
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 PyErr_BadInternalCall();
1391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001393
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001394 /* Decoding bytes objects is the most common case and should be fast */
1395 if (PyBytes_Check(obj)) {
1396 if (PyBytes_GET_SIZE(obj) == 0) {
1397 Py_INCREF(unicode_empty);
1398 v = (PyObject *) unicode_empty;
1399 }
1400 else {
1401 v = PyUnicode_Decode(
1402 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1403 encoding, errors);
1404 }
1405 return v;
1406 }
1407
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001408 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001409 PyErr_SetString(PyExc_TypeError,
1410 "decoding str is not supported");
1411 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001412 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1415 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1416 PyErr_Format(PyExc_TypeError,
1417 "coercing to str: need bytes, bytearray "
1418 "or buffer-like object, %.80s found",
1419 Py_TYPE(obj)->tp_name);
1420 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001421 }
Tim Petersced69f82003-09-16 20:30:58 +00001422
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001423 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001424 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001425 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 }
Tim Petersced69f82003-09-16 20:30:58 +00001427 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001428 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001429
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001430 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001431 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432}
1433
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8, writing the null-terminated result into lower.

   lower_len is the size of the lower buffer in bytes, including room for
   the terminating null byte.

   Return 0 on error (encoding is longer than lower_len-1, or lower_len is
   0), 1 on success.  On error the contents of lower are unspecified and
   not null-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator; without this
       check lower_len - 1 would wrap around and disable the bounds test
       below. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* last writable position, reserved for the null byte */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
1466
1467PyObject *PyUnicode_Decode(const char *s,
1468 Py_ssize_t size,
1469 const char *encoding,
1470 const char *errors)
1471{
1472 PyObject *buffer = NULL, *unicode;
1473 Py_buffer info;
1474 char lower[11]; /* Enough for any encoding shortcut */
1475
1476 if (encoding == NULL)
1477 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001478
1479 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01001480 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Victor Stinner37296e82010-06-10 13:36:23 +00001481 if (strcmp(lower, "utf-8") == 0)
1482 return PyUnicode_DecodeUTF8(s, size, errors);
1483 else if ((strcmp(lower, "latin-1") == 0) ||
1484 (strcmp(lower, "iso-8859-1") == 0))
1485 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001486#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001487 else if (strcmp(lower, "mbcs") == 0)
1488 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001489#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001490 else if (strcmp(lower, "ascii") == 0)
1491 return PyUnicode_DecodeASCII(s, size, errors);
1492 else if (strcmp(lower, "utf-16") == 0)
1493 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1494 else if (strcmp(lower, "utf-32") == 0)
1495 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497
1498 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001499 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001500 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001501 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001502 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 if (buffer == NULL)
1504 goto onError;
1505 unicode = PyCodec_Decode(buffer, encoding, errors);
1506 if (unicode == NULL)
1507 goto onError;
1508 if (!PyUnicode_Check(unicode)) {
1509 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001510 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001511 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 Py_DECREF(unicode);
1513 goto onError;
1514 }
1515 Py_DECREF(buffer);
1516 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001517
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 Py_XDECREF(buffer);
1520 return NULL;
1521}
1522
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001523PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1524 const char *encoding,
1525 const char *errors)
1526{
1527 PyObject *v;
1528
1529 if (!PyUnicode_Check(unicode)) {
1530 PyErr_BadArgument();
1531 goto onError;
1532 }
1533
1534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001536
1537 /* Decode via the codec registry */
1538 v = PyCodec_Decode(unicode, encoding, errors);
1539 if (v == NULL)
1540 goto onError;
1541 return v;
1542
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001544 return NULL;
1545}
1546
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001547PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
1550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 if (!PyUnicode_Check(v)) {
1566 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001567 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001568 Py_TYPE(v)->tp_name);
1569 Py_DECREF(v);
1570 goto onError;
1571 }
1572 return v;
1573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575 return NULL;
1576}
1577
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001579 Py_ssize_t size,
1580 const char *encoding,
1581 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582{
1583 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001584
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585 unicode = PyUnicode_FromUnicode(s, size);
1586 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1589 Py_DECREF(unicode);
1590 return v;
1591}
1592
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001593PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1594 const char *encoding,
1595 const char *errors)
1596{
1597 PyObject *v;
1598
1599 if (!PyUnicode_Check(unicode)) {
1600 PyErr_BadArgument();
1601 goto onError;
1602 }
1603
1604 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001606
1607 /* Encode via the codec registry */
1608 v = PyCodec_Encode(unicode, encoding, errors);
1609 if (v == NULL)
1610 goto onError;
1611 return v;
1612
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001614 return NULL;
1615}
1616
Victor Stinnerad158722010-10-27 00:25:46 +00001617PyObject *
1618PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001619{
Victor Stinner313a1202010-06-11 23:56:51 +00001620#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001621 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1622 PyUnicode_GET_SIZE(unicode),
1623 NULL);
1624#elif defined(__APPLE__)
1625 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1626 PyUnicode_GET_SIZE(unicode),
1627 "surrogateescape");
1628#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001629 PyInterpreterState *interp = PyThreadState_GET()->interp;
1630 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1631 cannot use it to encode and decode filenames before it is loaded. Load
1632 the Python codec requires to encode at least its own filename. Use the C
1633 version of the locale codec until the codec registry is initialized and
1634 the Python codec is loaded.
1635
1636 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1637 cannot only rely on it: check also interp->fscodec_initialized for
1638 subinterpreters. */
1639 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001640 return PyUnicode_AsEncodedString(unicode,
1641 Py_FileSystemDefaultEncoding,
1642 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001643 }
1644 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001645 /* locale encoding with surrogateescape */
1646 wchar_t *wchar;
1647 char *bytes;
1648 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001649 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001650
1651 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1652 if (wchar == NULL)
1653 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001654 bytes = _Py_wchar2char(wchar, &error_pos);
1655 if (bytes == NULL) {
1656 if (error_pos != (size_t)-1) {
1657 char *errmsg = strerror(errno);
1658 PyObject *exc = NULL;
1659 if (errmsg == NULL)
1660 errmsg = "Py_wchar2char() failed";
1661 raise_encode_exception(&exc,
1662 "filesystemencoding",
1663 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1664 error_pos, error_pos+1,
1665 errmsg);
1666 Py_XDECREF(exc);
1667 }
1668 else
1669 PyErr_NoMemory();
1670 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001671 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001672 }
1673 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001674
1675 bytes_obj = PyBytes_FromString(bytes);
1676 PyMem_Free(bytes);
1677 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001678 }
Victor Stinnerad158722010-10-27 00:25:46 +00001679#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001680}
1681
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1683 const char *encoding,
1684 const char *errors)
1685{
1686 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001687 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001688
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689 if (!PyUnicode_Check(unicode)) {
1690 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 }
Fred Drakee4315f52000-05-09 19:53:39 +00001693
Tim Petersced69f82003-09-16 20:30:58 +00001694 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001695 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001696
1697 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01001698 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Victor Stinner37296e82010-06-10 13:36:23 +00001699 if (strcmp(lower, "utf-8") == 0)
1700 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1701 PyUnicode_GET_SIZE(unicode),
1702 errors);
1703 else if ((strcmp(lower, "latin-1") == 0) ||
1704 (strcmp(lower, "iso-8859-1") == 0))
1705 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1706 PyUnicode_GET_SIZE(unicode),
1707 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001708#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001709 else if (strcmp(lower, "mbcs") == 0)
1710 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1711 PyUnicode_GET_SIZE(unicode),
1712 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001713#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001714 else if (strcmp(lower, "ascii") == 0)
1715 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1716 PyUnicode_GET_SIZE(unicode),
1717 errors);
1718 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001719 /* During bootstrap, we may need to find the encodings
1720 package, to load the file system encoding, and require the
1721 file system encoding in order to load the encodings
1722 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001723
Victor Stinner59e62db2010-05-15 13:14:32 +00001724 Break out of this dependency by assuming that the path to
1725 the encodings module is ASCII-only. XXX could try wcstombs
1726 instead, if the file system encoding is the locale's
1727 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001728 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001729 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1730 !PyThreadState_GET()->interp->codecs_initialized)
1731 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1732 PyUnicode_GET_SIZE(unicode),
1733 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734
1735 /* Encode via the codec registry */
1736 v = PyCodec_Encode(unicode, encoding, errors);
1737 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001738 return NULL;
1739
1740 /* The normal path */
1741 if (PyBytes_Check(v))
1742 return v;
1743
1744 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001745 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001746 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001747 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001748
1749 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1750 "encoder %s returned bytearray instead of bytes",
1751 encoding);
1752 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001753 Py_DECREF(v);
1754 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001755 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001756
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001757 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1758 Py_DECREF(v);
1759 return b;
1760 }
1761
1762 PyErr_Format(PyExc_TypeError,
1763 "encoder did not return a bytes object (type=%.400s)",
1764 Py_TYPE(v)->tp_name);
1765 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001766 return NULL;
1767}
1768
1769PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1770 const char *encoding,
1771 const char *errors)
1772{
1773 PyObject *v;
1774
1775 if (!PyUnicode_Check(unicode)) {
1776 PyErr_BadArgument();
1777 goto onError;
1778 }
1779
1780 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001781 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001782
1783 /* Encode via the codec registry */
1784 v = PyCodec_Encode(unicode, encoding, errors);
1785 if (v == NULL)
1786 goto onError;
1787 if (!PyUnicode_Check(v)) {
1788 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001789 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001790 Py_TYPE(v)->tp_name);
1791 Py_DECREF(v);
1792 goto onError;
1793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001795
Benjamin Peterson29060642009-01-31 22:14:21 +00001796 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 return NULL;
1798}
1799
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001800PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001801 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001802{
1803 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001804 if (v)
1805 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001806 if (errors != NULL)
1807 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001808 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001809 PyUnicode_GET_SIZE(unicode),
1810 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001811 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001812 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001813 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001814 return v;
1815}
1816
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001817PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001818PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001819 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001820 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1821}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001822
Christian Heimes5894ba72007-11-04 11:43:14 +00001823PyObject*
1824PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1825{
Victor Stinnerad158722010-10-27 00:25:46 +00001826#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1827 return PyUnicode_DecodeMBCS(s, size, NULL);
1828#elif defined(__APPLE__)
1829 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1830#else
Victor Stinner3cbf14b2011-04-27 00:24:21 +02001831 PyInterpreterState *interp = PyThreadState_GET()->interp;
1832 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1833 cannot use it to encode and decode filenames before it is loaded. Load
1834 the Python codec requires to encode at least its own filename. Use the C
1835 version of the locale codec until the codec registry is initialized and
1836 the Python codec is loaded.
1837
1838 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1839 cannot only rely on it: check also interp->fscodec_initialized for
1840 subinterpreters. */
1841 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001842 return PyUnicode_Decode(s, size,
1843 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001844 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001845 }
1846 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001847 /* locale encoding with surrogateescape */
1848 wchar_t *wchar;
1849 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001850 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001851
1852 if (s[size] != '\0' || size != strlen(s)) {
1853 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1854 return NULL;
1855 }
1856
Victor Stinner168e1172010-10-16 23:16:16 +00001857 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001858 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001859 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001860
Victor Stinner168e1172010-10-16 23:16:16 +00001861 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001862 PyMem_Free(wchar);
1863 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001864 }
Victor Stinnerad158722010-10-27 00:25:46 +00001865#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001866}
1867
Martin v. Löwis011e8422009-05-05 04:43:17 +00001868
1869int
Antoine Pitrou13348842012-01-29 18:36:34 +01001870_PyUnicode_HasNULChars(PyObject* s)
1871{
1872 static PyObject *nul = NULL;
1873
1874 if (nul == NULL)
1875 nul = PyUnicode_FromStringAndSize("\0", 1);
1876 if (nul == NULL)
1877 return -1;
1878 return PyUnicode_Contains(s, nul);
1879}
1880
1881
1882int
Martin v. Löwis011e8422009-05-05 04:43:17 +00001883PyUnicode_FSConverter(PyObject* arg, void* addr)
1884{
1885 PyObject *output = NULL;
1886 Py_ssize_t size;
1887 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001888 if (arg == NULL) {
1889 Py_DECREF(*(PyObject**)addr);
1890 return 1;
1891 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001892 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001893 output = arg;
1894 Py_INCREF(output);
1895 }
1896 else {
1897 arg = PyUnicode_FromObject(arg);
1898 if (!arg)
1899 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001900 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001901 Py_DECREF(arg);
1902 if (!output)
1903 return 0;
1904 if (!PyBytes_Check(output)) {
1905 Py_DECREF(output);
1906 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1907 return 0;
1908 }
1909 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001910 size = PyBytes_GET_SIZE(output);
1911 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001912 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05001913 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00001914 Py_DECREF(output);
1915 return 0;
1916 }
1917 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001918 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001919}
1920
1921
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001922int
1923PyUnicode_FSDecoder(PyObject* arg, void* addr)
1924{
1925 PyObject *output = NULL;
1926 Py_ssize_t size;
1927 void *data;
1928 if (arg == NULL) {
1929 Py_DECREF(*(PyObject**)addr);
1930 return 1;
1931 }
1932 if (PyUnicode_Check(arg)) {
1933 output = arg;
1934 Py_INCREF(output);
1935 }
1936 else {
1937 arg = PyBytes_FromObject(arg);
1938 if (!arg)
1939 return 0;
1940 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1941 PyBytes_GET_SIZE(arg));
1942 Py_DECREF(arg);
1943 if (!output)
1944 return 0;
1945 if (!PyUnicode_Check(output)) {
1946 Py_DECREF(output);
1947 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1948 return 0;
1949 }
1950 }
1951 size = PyUnicode_GET_SIZE(output);
1952 data = PyUnicode_AS_UNICODE(output);
1953 if (size != Py_UNICODE_strlen(data)) {
1954 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1955 Py_DECREF(output);
1956 return 0;
1957 }
1958 *(PyObject**)addr = output;
1959 return Py_CLEANUP_SUPPORTED;
1960}
1961
1962
Martin v. Löwis5b222132007-06-10 09:51:05 +00001963char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001964_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001965{
Christian Heimesf3863112007-11-22 07:46:41 +00001966 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001967 if (!PyUnicode_Check(unicode)) {
1968 PyErr_BadArgument();
1969 return NULL;
1970 }
Christian Heimesf3863112007-11-22 07:46:41 +00001971 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1972 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001973 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001974 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001975 *psize = PyBytes_GET_SIZE(bytes);
1976 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001977}
1978
1979char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001980_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001981{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001982 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001983}
1984
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1986{
1987 if (!PyUnicode_Check(unicode)) {
1988 PyErr_BadArgument();
1989 goto onError;
1990 }
1991 return PyUnicode_AS_UNICODE(unicode);
1992
Benjamin Peterson29060642009-01-31 22:14:21 +00001993 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994 return NULL;
1995}
1996
Martin v. Löwis18e16552006-02-15 17:27:45 +00001997Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998{
1999 if (!PyUnicode_Check(unicode)) {
2000 PyErr_BadArgument();
2001 goto onError;
2002 }
2003 return PyUnicode_GET_SIZE(unicode);
2004
Benjamin Peterson29060642009-01-31 22:14:21 +00002005 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002006 return -1;
2007}
2008
/* Name of the default encoding; this implementation always answers
   "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
2013
Victor Stinner554f3f02010-06-16 23:33:54 +00002014/* create or adjust a UnicodeDecodeError */
2015static void
2016make_decode_exception(PyObject **exceptionObject,
2017 const char *encoding,
2018 const char *input, Py_ssize_t length,
2019 Py_ssize_t startpos, Py_ssize_t endpos,
2020 const char *reason)
2021{
2022 if (*exceptionObject == NULL) {
2023 *exceptionObject = PyUnicodeDecodeError_Create(
2024 encoding, input, length, startpos, endpos, reason);
2025 }
2026 else {
2027 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2028 goto onError;
2029 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2030 goto onError;
2031 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2032 goto onError;
2033 }
2034 return;
2035
2036onError:
2037 Py_DECREF(*exceptionObject);
2038 *exceptionObject = NULL;
2039}
2040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041/* error handling callback helper:
2042 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002043 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 and adjust various state variables.
2045 return 0 on success, -1 on error
2046*/
2047
2048static
2049int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00002050 const char *encoding, const char *reason,
2051 const char **input, const char **inend, Py_ssize_t *startinpos,
2052 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2053 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002055 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056
2057 PyObject *restuple = NULL;
2058 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002059 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002060 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002061 Py_ssize_t requiredsize;
2062 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002064 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002065 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 int res = -1;
2067
2068 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002069 *errorHandler = PyCodec_LookupError(errors);
2070 if (*errorHandler == NULL)
2071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 }
2073
Victor Stinner554f3f02010-06-16 23:33:54 +00002074 make_decode_exception(exceptionObject,
2075 encoding,
2076 *input, *inend - *input,
2077 *startinpos, *endinpos,
2078 reason);
2079 if (*exceptionObject == NULL)
2080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081
2082 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2083 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002086 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002087 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002088 }
2089 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002090 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002091
2092 /* Copy back the bytes variables, which might have been modified by the
2093 callback */
2094 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2095 if (!inputobj)
2096 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002097 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002098 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002099 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002100 *input = PyBytes_AS_STRING(inputobj);
2101 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002102 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002103 /* we can DECREF safely, as the exception has another reference,
2104 so the object won't go away. */
2105 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002108 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002109 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002110 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2111 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002112 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113
2114 /* need more space? (at least enough for what we
2115 have+the replacement+the rest of the string (starting
2116 at the new input position), so we won't have to check space
2117 when there are no errors in the rest of the string) */
2118 repptr = PyUnicode_AS_UNICODE(repunicode);
2119 repsize = PyUnicode_GET_SIZE(repunicode);
2120 requiredsize = *outpos + repsize + insize-newpos;
2121 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002122 if (requiredsize<2*outsize)
2123 requiredsize = 2*outsize;
2124 if (_PyUnicode_Resize(output, requiredsize) < 0)
2125 goto onError;
2126 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 }
2128 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002129 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 Py_UNICODE_COPY(*outptr, repptr, repsize);
2131 *outptr += repsize;
2132 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 /* we made it! */
2135 res = 0;
2136
Benjamin Peterson29060642009-01-31 22:14:21 +00002137 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 Py_XDECREF(restuple);
2139 return res;
2140}
2141
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002142/* --- UTF-7 Codec -------------------------------------------------------- */
2143
Antoine Pitrou244651a2009-05-04 18:56:13 +00002144/* See RFC2152 for details. We encode conservatively and decode liberally. */
2145
2146/* Three simple macros defining base-64. */
2147
/* Is c a base-64 character ('A'-'Z', 'a'-'z', '0'-'9', '+' or '/')?
   Note: c is evaluated more than once. */

#define IS_BASE64(c)                     \
    ((c) == '+' || (c) == '/' ||         \
     ((c) >= '0' && (c) <= '9') ||       \
     ((c) >= 'a' && (c) <= 'z') ||       \
     ((c) >= 'A' && (c) <= 'Z'))
2155
/* Given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62 and
   anything else (i.e. '/') -> 63.  Note: c is evaluated more than once. */

#define FROM_BASE64(c)                                   \
    (((c) == '+') ? 62 :                                 \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :       \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :       \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)
2163
/* What is the base-64 character of the bottom 6 bits of n?
   Only the low six bits of n select the character; higher bits are
   masked off. */

#define TO_BASE64(n)                     \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
      "abcdefghijklmnopqrstuvwxyz"       \
      "0123456789+/")[(n) & 0x3f])
2168
/* DECODE_DIRECT: should this byte, encountered in a UTF-7 string, be
 * decoded as itself?  Decoding is permissive: every value up to 127 is
 * taken literally except '+', which begins a base64 section.
 * Note: c is evaluated more than once. */

#define DECODE_DIRECT(c)                 \
    ((c) != '+' && (c) < 128)
2176
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: should this character be encoded as itself?  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary
 * between applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category is only indexed with
 * values 1..127.  All three parameters are parenthesized in the expansion
 * so that compound flag expressions (e.g. `a || b`) keep their meaning.
 * Note: c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002222
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002223PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002224 Py_ssize_t size,
2225 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002226{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002227 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2228}
2229
Antoine Pitrou244651a2009-05-04 18:56:13 +00002230/* The decoder. The only state we preserve is our read position,
2231 * i.e. how many characters we have consumed. So if we end in the
2232 * middle of a shift sequence we have to back off the read position
2233 * and the output to the beginning of the sequence, otherwise we lose
2234 * all the shift state (seen bits, number of bits seen, high
2235 * surrogate). */
2236
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002237PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002238 Py_ssize_t size,
2239 const char *errors,
2240 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002241{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002242 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002243 Py_ssize_t startinpos;
2244 Py_ssize_t endinpos;
2245 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002246 const char *e;
2247 PyUnicodeObject *unicode;
2248 Py_UNICODE *p;
2249 const char *errmsg = "";
2250 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002251 Py_UNICODE *shiftOutStart;
2252 unsigned int base64bits = 0;
2253 unsigned long base64buffer = 0;
2254 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002255 PyObject *errorHandler = NULL;
2256 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002257
2258 unicode = _PyUnicode_New(size);
2259 if (!unicode)
2260 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002261 if (size == 0) {
2262 if (consumed)
2263 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002264 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002265 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002266
2267 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002268 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002269 e = s + size;
2270
2271 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002273 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002274 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002275
Antoine Pitrou244651a2009-05-04 18:56:13 +00002276 if (inShift) { /* in a base-64 section */
2277 if (IS_BASE64(ch)) { /* consume a base-64 character */
2278 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2279 base64bits += 6;
2280 s++;
2281 if (base64bits >= 16) {
2282 /* we have enough bits for a UTF-16 value */
2283 Py_UNICODE outCh = (Py_UNICODE)
2284 (base64buffer >> (base64bits-16));
2285 base64bits -= 16;
2286 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2287 if (surrogate) {
2288 /* expecting a second surrogate */
2289 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2290#ifdef Py_UNICODE_WIDE
2291 *p++ = (((surrogate & 0x3FF)<<10)
2292 | (outCh & 0x3FF)) + 0x10000;
2293#else
2294 *p++ = surrogate;
2295 *p++ = outCh;
2296#endif
2297 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002298 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002299 }
2300 else {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002301 *p++ = surrogate;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002302 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002303 }
2304 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002305 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002306 /* first surrogate */
2307 surrogate = outCh;
2308 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002309 else {
2310 *p++ = outCh;
2311 }
2312 }
2313 }
2314 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002315 inShift = 0;
2316 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002317 if (surrogate) {
Antoine Pitrou5418ee02011-11-15 01:42:21 +01002318 *p++ = surrogate;
2319 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002320 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002321 if (base64bits > 0) { /* left-over bits */
2322 if (base64bits >= 6) {
2323 /* We've seen at least one base-64 character */
2324 errmsg = "partial character in shift sequence";
2325 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002326 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002327 else {
2328 /* Some bits remain; they should be zero */
2329 if (base64buffer != 0) {
2330 errmsg = "non-zero padding bits in shift sequence";
2331 goto utf7Error;
2332 }
2333 }
2334 }
2335 if (ch != '-') {
2336 /* '-' is absorbed; other terminating
2337 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002338 *p++ = ch;
2339 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002340 }
2341 }
2342 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002343 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002344 s++; /* consume '+' */
2345 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002346 s++;
2347 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 }
2349 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002350 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002351 shiftOutStart = p;
2352 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002353 }
2354 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002355 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002356 *p++ = ch;
2357 s++;
2358 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002359 else {
2360 startinpos = s-starts;
2361 s++;
2362 errmsg = "unexpected special character";
2363 goto utf7Error;
2364 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002365 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002366utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002367 outpos = p-PyUnicode_AS_UNICODE(unicode);
2368 endinpos = s-starts;
2369 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002370 errors, &errorHandler,
2371 "utf7", errmsg,
2372 &starts, &e, &startinpos, &endinpos, &exc, &s,
2373 &unicode, &outpos, &p))
2374 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002375 }
2376
Antoine Pitrou244651a2009-05-04 18:56:13 +00002377 /* end of string */
2378
2379 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2380 /* if we're in an inconsistent state, that's an error */
2381 if (surrogate ||
2382 (base64bits >= 6) ||
2383 (base64bits > 0 && base64buffer != 0)) {
2384 outpos = p-PyUnicode_AS_UNICODE(unicode);
2385 endinpos = size;
2386 if (unicode_decode_call_errorhandler(
2387 errors, &errorHandler,
2388 "utf7", "unterminated shift sequence",
2389 &starts, &e, &startinpos, &endinpos, &exc, &s,
2390 &unicode, &outpos, &p))
2391 goto onError;
2392 if (s < e)
2393 goto restart;
2394 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002395 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002396
2397 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002398 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002399 if (inShift) {
2400 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002401 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002402 }
2403 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002404 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002405 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002406 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002407
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002408 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002409 goto onError;
2410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002411 Py_XDECREF(errorHandler);
2412 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002413 return (PyObject *)unicode;
2414
Benjamin Peterson29060642009-01-31 22:14:21 +00002415 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002416 Py_XDECREF(errorHandler);
2417 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002418 Py_DECREF(unicode);
2419 return NULL;
2420}
2421
2422
2423PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002424 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002425 int base64SetO,
2426 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002427 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002428{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002429 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002430 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002431 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002432 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002433 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002434 unsigned int base64bits = 0;
2435 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002436 char * out;
2437 char * start;
2438
2439 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002440 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002441
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002442 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002443 return PyErr_NoMemory();
2444
Antoine Pitrou244651a2009-05-04 18:56:13 +00002445 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002446 if (v == NULL)
2447 return NULL;
2448
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002449 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002450 for (;i < size; ++i) {
2451 Py_UNICODE ch = s[i];
2452
Antoine Pitrou244651a2009-05-04 18:56:13 +00002453 if (inShift) {
2454 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2455 /* shifting out */
2456 if (base64bits) { /* output remaining bits */
2457 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2458 base64buffer = 0;
2459 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002460 }
2461 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002462 /* Characters not in the BASE64 set implicitly unshift the sequence
2463 so no '-' is required, except if the character is itself a '-' */
2464 if (IS_BASE64(ch) || ch == '-') {
2465 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002466 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002467 *out++ = (char) ch;
2468 }
2469 else {
2470 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002471 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002472 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002473 else { /* not in a shift sequence */
2474 if (ch == '+') {
2475 *out++ = '+';
2476 *out++ = '-';
2477 }
2478 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2479 *out++ = (char) ch;
2480 }
2481 else {
2482 *out++ = '+';
2483 inShift = 1;
2484 goto encode_char;
2485 }
2486 }
2487 continue;
2488encode_char:
2489#ifdef Py_UNICODE_WIDE
2490 if (ch >= 0x10000) {
2491 /* code first surrogate */
2492 base64bits += 16;
2493 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2494 while (base64bits >= 6) {
2495 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2496 base64bits -= 6;
2497 }
2498 /* prepare second surrogate */
2499 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2500 }
2501#endif
2502 base64bits += 16;
2503 base64buffer = (base64buffer << 16) | ch;
2504 while (base64bits >= 6) {
2505 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2506 base64bits -= 6;
2507 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002508 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002509 if (base64bits)
2510 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2511 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002512 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002513 if (_PyBytes_Resize(&v, out - start) < 0)
2514 return NULL;
2515 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002516}
2517
/* Retire the UTF-7 helper macros so that they do not leak into the
   codecs that follow. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002523
Guido van Rossumd57fd912000-03-10 22:53:23 +00002524/* --- UTF-8 Codec -------------------------------------------------------- */
2525
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.
       Each row below covers 16 consecutive byte values. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2547
Guido van Rossumd57fd912000-03-10 22:53:23 +00002548PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002549 Py_ssize_t size,
2550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551{
Walter Dörwald69652032004-09-07 20:24:22 +00002552 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2553}
2554
Antoine Pitrouab868312009-01-10 15:40:25 +00002555/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2556#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2557
2558/* Mask to quickly check whether a C 'long' contains a
2559 non-ASCII, UTF8-encoded char. */
2560#if (SIZEOF_LONG == 8)
2561# define ASCII_CHAR_MASK 0x8080808080808080L
2562#elif (SIZEOF_LONG == 4)
2563# define ASCII_CHAR_MASK 0x80808080L
2564#else
2565# error C 'long' size should be either 4 or 8!
2566#endif
2567
Walter Dörwald69652032004-09-07 20:24:22 +00002568PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002569 Py_ssize_t size,
2570 const char *errors,
2571 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002572{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002573 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002575 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002576 Py_ssize_t startinpos;
2577 Py_ssize_t endinpos;
2578 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002579 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 PyUnicodeObject *unicode;
2581 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002582 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002583 PyObject *errorHandler = NULL;
2584 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002585
2586 /* Note: size will always be longer than the resulting Unicode
2587 character count */
2588 unicode = _PyUnicode_New(size);
2589 if (!unicode)
2590 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002591 if (size == 0) {
2592 if (consumed)
2593 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596
2597 /* Unpack UTF-8 encoded data */
2598 p = unicode->str;
2599 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002600 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601
2602 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002603 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604
2605 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002606 /* Fast path for runs of ASCII characters. Given that common UTF-8
2607 input will consist of an overwhelming majority of ASCII
2608 characters, we try to optimize for this case by checking
2609 as many characters as a C 'long' can contain.
2610 First, check if we can do an aligned read, as most CPUs have
2611 a penalty for unaligned reads.
2612 */
2613 if (!((size_t) s & LONG_PTR_MASK)) {
2614 /* Help register allocation */
2615 register const char *_s = s;
2616 register Py_UNICODE *_p = p;
2617 while (_s < aligned_end) {
2618 /* Read a whole long at a time (either 4 or 8 bytes),
2619 and do a fast unrolled copy if it only contains ASCII
2620 characters. */
2621 unsigned long data = *(unsigned long *) _s;
2622 if (data & ASCII_CHAR_MASK)
2623 break;
2624 _p[0] = (unsigned char) _s[0];
2625 _p[1] = (unsigned char) _s[1];
2626 _p[2] = (unsigned char) _s[2];
2627 _p[3] = (unsigned char) _s[3];
2628#if (SIZEOF_LONG == 8)
2629 _p[4] = (unsigned char) _s[4];
2630 _p[5] = (unsigned char) _s[5];
2631 _p[6] = (unsigned char) _s[6];
2632 _p[7] = (unsigned char) _s[7];
2633#endif
2634 _s += SIZEOF_LONG;
2635 _p += SIZEOF_LONG;
2636 }
2637 s = _s;
2638 p = _p;
2639 if (s == e)
2640 break;
2641 ch = (unsigned char)*s;
2642 }
2643 }
2644
2645 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002646 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647 s++;
2648 continue;
2649 }
2650
2651 n = utf8_code_length[ch];
2652
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002653 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002654 if (consumed)
2655 break;
2656 else {
2657 errmsg = "unexpected end of data";
2658 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002659 endinpos = startinpos+1;
2660 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2661 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002662 goto utf8Error;
2663 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665
2666 switch (n) {
2667
2668 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002669 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002670 startinpos = s-starts;
2671 endinpos = startinpos+1;
2672 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002673
2674 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002675 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 startinpos = s-starts;
2677 endinpos = startinpos+1;
2678 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679
2680 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002681 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002682 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002684 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 goto utf8Error;
2686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002688 assert ((ch > 0x007F) && (ch <= 0x07FF));
2689 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 break;
2691
2692 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002693 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2694 will result in surrogates in range d800-dfff. Surrogates are
2695 not valid UTF-8 so they are rejected.
2696 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2697 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002698 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002699 (s[2] & 0xc0) != 0x80 ||
2700 ((unsigned char)s[0] == 0xE0 &&
2701 (unsigned char)s[1] < 0xA0) ||
2702 ((unsigned char)s[0] == 0xED &&
2703 (unsigned char)s[1] > 0x9F)) {
2704 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002705 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002706 endinpos = startinpos + 1;
2707
2708 /* if s[1] first two bits are 1 and 0, then the invalid
2709 continuation byte is s[2], so increment endinpos by 1,
2710 if not, s[1] is invalid and endinpos doesn't need to
2711 be incremented. */
2712 if ((s[1] & 0xC0) == 0x80)
2713 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002714 goto utf8Error;
2715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002716 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002717 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2718 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002719 break;
2720
2721 case 4:
2722 if ((s[1] & 0xc0) != 0x80 ||
2723 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002724 (s[3] & 0xc0) != 0x80 ||
2725 ((unsigned char)s[0] == 0xF0 &&
2726 (unsigned char)s[1] < 0x90) ||
2727 ((unsigned char)s[0] == 0xF4 &&
2728 (unsigned char)s[1] > 0x8F)) {
2729 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002730 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002731 endinpos = startinpos + 1;
2732 if ((s[1] & 0xC0) == 0x80) {
2733 endinpos++;
2734 if ((s[2] & 0xC0) == 0x80)
2735 endinpos++;
2736 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 goto utf8Error;
2738 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002739 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002740 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2741 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2742
Fredrik Lundh8f455852001-06-27 18:59:43 +00002743#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002744 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002745#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002746 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002747
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002748 /* translate from 10000..10FFFF to 0..FFFF */
2749 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002750
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002751 /* high surrogate = top 10 bits added to D800 */
2752 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002753
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002754 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002755 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002756#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 }
2759 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002761
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 utf8Error:
2763 outpos = p-PyUnicode_AS_UNICODE(unicode);
2764 if (unicode_decode_call_errorhandler(
2765 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01002766 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00002767 &starts, &e, &startinpos, &endinpos, &exc, &s,
2768 &unicode, &outpos, &p))
2769 goto onError;
2770 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 }
Walter Dörwald69652032004-09-07 20:24:22 +00002772 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002773 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774
2775 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002776 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 goto onError;
2778
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002779 Py_XDECREF(errorHandler);
2780 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 return (PyObject *)unicode;
2782
Benjamin Peterson29060642009-01-31 22:14:21 +00002783 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002784 Py_XDECREF(errorHandler);
2785 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 Py_DECREF(unicode);
2787 return NULL;
2788}
2789
/* ASCII_CHAR_MASK is not needed past the UTF-8 decoder above. */
#undef ASCII_CHAR_MASK
2791
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Every byte that cannot be decoded as part of a valid UTF-8 sequence is
   mapped to the code unit 0xDC00 + byte and decoding continues with the
   next byte.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error
   (including a size too large to allocate). The result is terminated
   with a null wide character. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Reject sizes for which (size + 1) * sizeof(wchar_t) cannot be
       represented.  The test is written without computing size + 1 so
       that it cannot itself overflow. */
    if ((size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* sequence is cut off by the end of the input */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the output buffer holds wchar_t, like every other store
               in this function */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the offending lead byte here: escape that one
           byte and resume decoding at the byte after it */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002907
Tim Peters602f7402002-04-27 18:03:26 +00002908/* Allocation strategy: if the string is short, convert into a stack buffer
2909 and allocate exactly as much space needed at the end. Else allocate the
2910 maximum possible needed (4 result bytes per Unicode character), and return
2911 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002912*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002913PyObject *
2914PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002915 Py_ssize_t size,
2916 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917{
Tim Peters602f7402002-04-27 18:03:26 +00002918#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002919
Guido van Rossum98297ee2007-11-06 21:34:58 +00002920 Py_ssize_t i; /* index into s of next input byte */
2921 PyObject *result; /* result string object */
2922 char *p; /* next free byte in output buffer */
2923 Py_ssize_t nallocated; /* number of result bytes allocated */
2924 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002925 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002926 PyObject *errorHandler = NULL;
2927 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002928
Tim Peters602f7402002-04-27 18:03:26 +00002929 assert(s != NULL);
2930 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002931
Tim Peters602f7402002-04-27 18:03:26 +00002932 if (size <= MAX_SHORT_UNICHARS) {
2933 /* Write into the stack buffer; nallocated can't overflow.
2934 * At the end, we'll allocate exactly as much heap space as it
2935 * turns out we need.
2936 */
2937 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002938 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002939 p = stackbuf;
2940 }
2941 else {
2942 /* Overallocate on the heap, and give the excess back at the end. */
2943 nallocated = size * 4;
2944 if (nallocated / 4 != size) /* overflow! */
2945 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002946 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002947 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002948 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002949 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002950 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002951
Tim Peters602f7402002-04-27 18:03:26 +00002952 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002953 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002954
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002955 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002956 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002958
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002960 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002961 *p++ = (char)(0xc0 | (ch >> 6));
2962 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002963 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002964#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002965 /* Special case: check for high and low surrogate */
2966 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2967 Py_UCS4 ch2 = s[i];
2968 /* Combine the two surrogates to form a UCS4 value */
2969 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2970 i++;
2971
2972 /* Encode UCS4 Unicode ordinals */
2973 *p++ = (char)(0xf0 | (ch >> 18));
2974 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002975 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2976 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002977 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002978#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002979 Py_ssize_t newpos;
2980 PyObject *rep;
2981 Py_ssize_t repsize, k;
2982 rep = unicode_encode_call_errorhandler
2983 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2984 s, size, &exc, i-1, i, &newpos);
2985 if (!rep)
2986 goto error;
2987
2988 if (PyBytes_Check(rep))
2989 repsize = PyBytes_GET_SIZE(rep);
2990 else
2991 repsize = PyUnicode_GET_SIZE(rep);
2992
2993 if (repsize > 4) {
2994 Py_ssize_t offset;
2995
2996 if (result == NULL)
2997 offset = p - stackbuf;
2998 else
2999 offset = p - PyBytes_AS_STRING(result);
3000
3001 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3002 /* integer overflow */
3003 PyErr_NoMemory();
3004 goto error;
3005 }
3006 nallocated += repsize - 4;
3007 if (result != NULL) {
3008 if (_PyBytes_Resize(&result, nallocated) < 0)
3009 goto error;
3010 } else {
3011 result = PyBytes_FromStringAndSize(NULL, nallocated);
3012 if (result == NULL)
3013 goto error;
3014 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3015 }
3016 p = PyBytes_AS_STRING(result) + offset;
3017 }
3018
3019 if (PyBytes_Check(rep)) {
3020 char *prep = PyBytes_AS_STRING(rep);
3021 for(k = repsize; k > 0; k--)
3022 *p++ = *prep++;
3023 } else /* rep is unicode */ {
3024 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3025 Py_UNICODE c;
3026
3027 for(k=0; k<repsize; k++) {
3028 c = prep[k];
3029 if (0x80 <= c) {
3030 raise_encode_exception(&exc, "utf-8", s, size,
3031 i-1, i, "surrogates not allowed");
3032 goto error;
3033 }
3034 *p++ = (char)prep[k];
3035 }
3036 }
3037 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003038#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003039 }
Victor Stinner445a6232010-04-22 20:01:57 +00003040#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003041 } else if (ch < 0x10000) {
3042 *p++ = (char)(0xe0 | (ch >> 12));
3043 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3044 *p++ = (char)(0x80 | (ch & 0x3f));
3045 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003046 /* Encode UCS4 Unicode ordinals */
3047 *p++ = (char)(0xf0 | (ch >> 18));
3048 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3049 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3050 *p++ = (char)(0x80 | (ch & 0x3f));
3051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003053
Guido van Rossum98297ee2007-11-06 21:34:58 +00003054 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003055 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003056 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003057 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003058 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003059 }
3060 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003061 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003062 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003063 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003064 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003065 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003066 Py_XDECREF(errorHandler);
3067 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003068 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003069 error:
3070 Py_XDECREF(errorHandler);
3071 Py_XDECREF(exc);
3072 Py_XDECREF(result);
3073 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003074
Tim Peters602f7402002-04-27 18:03:26 +00003075#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076}
3077
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3079{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 if (!PyUnicode_Check(unicode)) {
3081 PyErr_BadArgument();
3082 return NULL;
3083 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003084 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003085 PyUnicode_GET_SIZE(unicode),
3086 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087}
3088
/* --- UTF-32 Codec ------------------------------------------------------- */
3090
3091PyObject *
3092PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003093 Py_ssize_t size,
3094 const char *errors,
3095 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003096{
3097 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3098}
3099
3100PyObject *
3101PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003102 Py_ssize_t size,
3103 const char *errors,
3104 int *byteorder,
3105 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003106{
3107 const char *starts = s;
3108 Py_ssize_t startinpos;
3109 Py_ssize_t endinpos;
3110 Py_ssize_t outpos;
3111 PyUnicodeObject *unicode;
3112 Py_UNICODE *p;
3113#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003114 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003115 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003116#else
3117 const int pairs = 0;
3118#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003119 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003120 int bo = 0; /* assume native ordering by default */
3121 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003122 /* Offsets from q for retrieving bytes in the right order. */
3123#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3124 int iorder[] = {0, 1, 2, 3};
3125#else
3126 int iorder[] = {3, 2, 1, 0};
3127#endif
3128 PyObject *errorHandler = NULL;
3129 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003130
Walter Dörwald41980ca2007-08-16 21:55:45 +00003131 q = (unsigned char *)s;
3132 e = q + size;
3133
3134 if (byteorder)
3135 bo = *byteorder;
3136
3137 /* Check for BOM marks (U+FEFF) in the input and adjust current
3138 byte order setting accordingly. In native mode, the leading BOM
3139 mark is skipped, in all other modes, it is copied to the output
3140 stream as-is (giving a ZWNBSP character). */
3141 if (bo == 0) {
3142 if (size >= 4) {
3143 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003145#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 if (bom == 0x0000FEFF) {
3147 q += 4;
3148 bo = -1;
3149 }
3150 else if (bom == 0xFFFE0000) {
3151 q += 4;
3152 bo = 1;
3153 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003154#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 if (bom == 0x0000FEFF) {
3156 q += 4;
3157 bo = 1;
3158 }
3159 else if (bom == 0xFFFE0000) {
3160 q += 4;
3161 bo = -1;
3162 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003163#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003165 }
3166
3167 if (bo == -1) {
3168 /* force LE */
3169 iorder[0] = 0;
3170 iorder[1] = 1;
3171 iorder[2] = 2;
3172 iorder[3] = 3;
3173 }
3174 else if (bo == 1) {
3175 /* force BE */
3176 iorder[0] = 3;
3177 iorder[1] = 2;
3178 iorder[2] = 1;
3179 iorder[3] = 0;
3180 }
3181
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003182 /* On narrow builds we split characters outside the BMP into two
3183 codepoints => count how much extra space we need. */
3184#ifndef Py_UNICODE_WIDE
Serhiy Storchakadec798e2013-01-08 22:45:42 +02003185 for (qq = q; e - qq >= 4; qq += 4)
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003186 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3187 pairs++;
3188#endif
3189
3190 /* This might be one to much, because of a BOM */
3191 unicode = _PyUnicode_New((size+3)/4+pairs);
3192 if (!unicode)
3193 return NULL;
3194 if (size == 0)
3195 return (PyObject *)unicode;
3196
3197 /* Unpack UTF-32 encoded data */
3198 p = unicode->str;
3199
Walter Dörwald41980ca2007-08-16 21:55:45 +00003200 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003201 Py_UCS4 ch;
3202 /* remaining bytes at the end? (size should be divisible by 4) */
3203 if (e-q<4) {
3204 if (consumed)
3205 break;
3206 errmsg = "truncated data";
3207 startinpos = ((const char *)q)-starts;
3208 endinpos = ((const char *)e)-starts;
3209 goto utf32Error;
3210 /* The remaining input chars are ignored if the callback
3211 chooses to skip the input */
3212 }
3213 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3214 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003215
Benjamin Peterson29060642009-01-31 22:14:21 +00003216 if (ch >= 0x110000)
3217 {
3218 errmsg = "codepoint not in range(0x110000)";
3219 startinpos = ((const char *)q)-starts;
3220 endinpos = startinpos+4;
3221 goto utf32Error;
3222 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003223#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 if (ch >= 0x10000)
3225 {
3226 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3227 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3228 }
3229 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003230#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 *p++ = ch;
3232 q += 4;
3233 continue;
3234 utf32Error:
3235 outpos = p-PyUnicode_AS_UNICODE(unicode);
3236 if (unicode_decode_call_errorhandler(
3237 errors, &errorHandler,
3238 "utf32", errmsg,
3239 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3240 &unicode, &outpos, &p))
3241 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003242 }
3243
3244 if (byteorder)
3245 *byteorder = bo;
3246
3247 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003248 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003249
3250 /* Adjust length */
3251 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3252 goto onError;
3253
3254 Py_XDECREF(errorHandler);
3255 Py_XDECREF(exc);
3256 return (PyObject *)unicode;
3257
Benjamin Peterson29060642009-01-31 22:14:21 +00003258 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003259 Py_DECREF(unicode);
3260 Py_XDECREF(errorHandler);
3261 Py_XDECREF(exc);
3262 return NULL;
3263}
3264
3265PyObject *
3266PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 Py_ssize_t size,
3268 const char *errors,
3269 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003270{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003271 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003272 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003273 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003274#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003275 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003276#else
3277 const int pairs = 0;
3278#endif
3279 /* Offsets from p for storing byte pairs in the right order. */
3280#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3281 int iorder[] = {0, 1, 2, 3};
3282#else
3283 int iorder[] = {3, 2, 1, 0};
3284#endif
3285
Benjamin Peterson29060642009-01-31 22:14:21 +00003286#define STORECHAR(CH) \
3287 do { \
3288 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3289 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3290 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3291 p[iorder[0]] = (CH) & 0xff; \
3292 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003293 } while(0)
3294
3295 /* In narrow builds we can output surrogate pairs as one codepoint,
3296 so we need less space. */
3297#ifndef Py_UNICODE_WIDE
3298 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003299 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3300 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3301 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003302#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003303 nsize = (size - pairs + (byteorder == 0));
3304 bytesize = nsize * 4;
3305 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003307 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003308 if (v == NULL)
3309 return NULL;
3310
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003311 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003312 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003314 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003315 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003316
3317 if (byteorder == -1) {
3318 /* force LE */
3319 iorder[0] = 0;
3320 iorder[1] = 1;
3321 iorder[2] = 2;
3322 iorder[3] = 3;
3323 }
3324 else if (byteorder == 1) {
3325 /* force BE */
3326 iorder[0] = 3;
3327 iorder[1] = 2;
3328 iorder[2] = 1;
3329 iorder[3] = 0;
3330 }
3331
3332 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003334#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003335 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3336 Py_UCS4 ch2 = *s;
3337 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3338 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3339 s++;
3340 size--;
3341 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003342 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003343#endif
3344 STORECHAR(ch);
3345 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003346
3347 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003348 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003349#undef STORECHAR
3350}
3351
3352PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3353{
3354 if (!PyUnicode_Check(unicode)) {
3355 PyErr_BadArgument();
3356 return NULL;
3357 }
3358 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003359 PyUnicode_GET_SIZE(unicode),
3360 NULL,
3361 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003362}
3363
/* --- UTF-16 Codec ------------------------------------------------------- */
3365
Tim Peters772747b2001-08-09 22:21:55 +00003366PyObject *
3367PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003368 Py_ssize_t size,
3369 const char *errors,
3370 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371{
Walter Dörwald69652032004-09-07 20:24:22 +00003372 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3373}
3374
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   FAST_CHAR_MASK has bit 15 set in every 16-bit lane of the long;
   SWAPPED_FAST_CHAR_MASK has bit 7 set in every 16-bit lane, which is
   where that same bit lands when the two bytes of each unit are swapped.
   A word that ANDs to zero against the mask therefore holds only code
   units below 0x8000.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
/* Only 4- and 8-byte longs are supported by the unrolled copy loops. */
# error C 'long' size should be either 4 or 8!
#endif
3391
/* Decode `size` bytes of UTF-16 data starting at `s` into a Unicode object.

   Parameters:
     s, size    - input buffer and its length in bytes.
     errors     - error-handling scheme name, passed through to
                  unicode_decode_call_errorhandler().
     byteorder  - optional in/out parameter.  On entry *byteorder selects
                  the byte order: -1 forces little endian, 1 forces big
                  endian, 0 means "native unless a BOM says otherwise".
                  After decoding, the byte order actually used is written
                  back.
     consumed   - optional out parameter.  When non-NULL, a trailing odd
                  byte or a surrogate with no following unit is not an
                  error: decoding stops in front of it and *consumed
                  receives the number of bytes that were processed.

   Returns the decoded object, or NULL if allocation fails or the error
   handler reports failure.

   Note: for size == 0 the freshly allocated object is returned right away,
   before *byteorder or *consumed are written. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const unsigned char *q, *e, *aligned_end;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering = 0;
    const char *errmsg = "";
    /* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = 1, ilo = 0;
#else
    int ihi = 0, ilo = 1;
#endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    /* Unpack UTF-16 encoded data */
    p = unicode->str;
    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
        if (size >= 2) {
            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
            if (bom == 0xFEFF) {
                q += 2;
                bo = -1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = 1;
            }
#else
            if (bom == 0xFEFF) {
                q += 2;
                bo = 1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = -1;
            }
#endif
        }
    }

    if (bo == -1) {
        /* force LE */
        ihi = 1;
        ilo = 0;
    }
    else if (bo == 1) {
        /* force BE */
        ihi = 0;
        ilo = 1;
    }
    /* native_ordering is true when the selected ihi/ilo offsets equal the
       platform defaults set in the declarations above. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    native_ordering = ilo < ihi;
#else
    native_ordering = ilo > ihi;
#endif

    /* Round the end pointer down with LONG_PTR_MASK (defined outside this
       block; presumably sizeof(long) - 1) so the fast path below never
       reads a whole long past the input. */
    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
    while (1) {
        Py_UNICODE ch;
        if (e - q < 2) {
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                break;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            outpos = p - PyUnicode_AS_UNICODE(unicode);
            goto utf16Error;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        }
        /* First check for possible aligned read of a C 'long'. Unaligned
           reads are more expensive, better to defer to another iteration. */
        if (!((size_t) q & LONG_PTR_MASK)) {
            /* Fast path for runs of non-surrogate chars. */
            register const unsigned char *_q = q;
            Py_UNICODE *_p = p;
            if (native_ordering) {
                /* Native ordering is simple: as long as the input cannot
                   possibly contain a surrogate char, do an unrolled copy
                   of several 16-bit code points to the target object.
                   The non-surrogate check is done on several input bytes
                   at a time (as many as a C 'long' can contain). */
                while (_q < aligned_end) {
                    unsigned long data = * (unsigned long *) _q;
                    if (data & FAST_CHAR_MASK)
                        break;
                    _p[0] = ((unsigned short *) _q)[0];
                    _p[1] = ((unsigned short *) _q)[1];
#if (SIZEOF_LONG == 8)
                    _p[2] = ((unsigned short *) _q)[2];
                    _p[3] = ((unsigned short *) _q)[3];
#endif
                    _q += SIZEOF_LONG;
                    _p += SIZEOF_LONG / 2;
                }
            }
            else {
                /* Byteswapped ordering is similar, but we must decompose
                   the copy bytewise, and take care of zero'ing out the
                   upper bytes if the target object is in 32-bit units
                   (that is, in UCS-4 builds). */
                while (_q < aligned_end) {
                    unsigned long data = * (unsigned long *) _q;
                    if (data & SWAPPED_FAST_CHAR_MASK)
                        break;
                    /* Zero upper bytes in UCS-4 builds */
#if (Py_UNICODE_SIZE > 2)
                    _p[0] = 0;
                    _p[1] = 0;
#if (SIZEOF_LONG == 8)
                    _p[2] = 0;
                    _p[3] = 0;
#endif
#endif
                    /* Issue #4916; UCS-4 builds on big endian machines must
                       fill the two last bytes of each 4-byte unit. */
#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
# define OFF 2
#else
# define OFF 0
#endif
                    /* Write each input byte pair swapped into the output
                       unit: input byte 0 goes to offset OFF+1, input byte 1
                       to offset OFF+0, and so on per Py_UNICODE slot. */
                    ((unsigned char *) _p)[OFF + 1] = _q[0];
                    ((unsigned char *) _p)[OFF + 0] = _q[1];
                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
#if (SIZEOF_LONG == 8)
                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
#endif
#undef OFF
                    _q += SIZEOF_LONG;
                    _p += SIZEOF_LONG / 2;
                }
            }
            p = _p;
            q = _q;
            /* The fast path may have consumed everything; go back to the
               end-of-input check at the top of the loop. */
            if (e - q < 2)
                continue;
        }
        /* Slow path: decode one 16-bit unit using the selected byte order. */
        ch = (q[ihi] << 8) | q[ilo];

        q += 2;

        if (ch < 0xD800 || ch > 0xDFFF) {
            *p++ = ch;
            continue;
        }

        /* UTF-16 code pair: */
        if (e - q < 2) {
            /* No room for a second unit: step back so the surrogate just
               read is left unconsumed / is included in the error range. */
            q -= 2;
            if (consumed)
                break;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            goto utf16Error;
        }
        if (0xD800 <= ch && ch <= 0xDBFF) {
            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
            q += 2;
            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
                /* Narrow build: keep the pair as two code units. */
                *p++ = ch;
                *p++ = ch2;
#else
                /* Wide build: combine the pair into one code point. */
                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
#endif
                continue;
            }
            else {
                /* High surrogate not followed by a low surrogate; the
                   reported range is the 2 bytes of the high surrogate. */
                errmsg = "illegal UTF-16 surrogate";
                startinpos = (((const char *)q)-4)-starts;
                endinpos = startinpos+2;
                goto utf16Error;
            }

        }
        /* ch is a low surrogate (0xDC00..0xDFFF) with no preceding high
           surrogate. */
        errmsg = "illegal encoding";
        startinpos = (((const char *)q)-2)-starts;
        endinpos = startinpos+2;
        /* Fall through to report the error */

      utf16Error:
        outpos = p - PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors,
                &errorHandler,
                "utf16", errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &unicode,
                &outpos,
                &p))
            goto onError;
        /* Update data because unicode_decode_call_errorhandler might have
           changed the input object. */
        aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
    }

    if (byteorder)
        *byteorder = bo;

    if (consumed)
        *consumed = (const char *)q-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_DECREF(unicode);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
3650
/* The surrogate-detection masks are only needed by
   PyUnicode_DecodeUTF16Stateful() above; drop them again. */
#undef FAST_CHAR_MASK
#undef SWAPPED_FAST_CHAR_MASK
3653
Tim Peters772747b2001-08-09 22:21:55 +00003654PyObject *
3655PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 Py_ssize_t size,
3657 const char *errors,
3658 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003660 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003661 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003662 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003663#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003664 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003665#else
3666 const int pairs = 0;
3667#endif
Tim Peters772747b2001-08-09 22:21:55 +00003668 /* Offsets from p for storing byte pairs in the right order. */
3669#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3670 int ihi = 1, ilo = 0;
3671#else
3672 int ihi = 0, ilo = 1;
3673#endif
3674
Benjamin Peterson29060642009-01-31 22:14:21 +00003675#define STORECHAR(CH) \
3676 do { \
3677 p[ihi] = ((CH) >> 8) & 0xff; \
3678 p[ilo] = (CH) & 0xff; \
3679 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003680 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003682#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003683 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003684 if (s[i] >= 0x10000)
3685 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003686#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003687 /* 2 * (size + pairs + (byteorder == 0)) */
3688 if (size > PY_SSIZE_T_MAX ||
3689 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003690 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003691 nsize = size + pairs + (byteorder == 0);
3692 bytesize = nsize * 2;
3693 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003695 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 if (v == NULL)
3697 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003699 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003702 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003703 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003704
3705 if (byteorder == -1) {
3706 /* force LE */
3707 ihi = 1;
3708 ilo = 0;
3709 }
3710 else if (byteorder == 1) {
3711 /* force BE */
3712 ihi = 0;
3713 ilo = 1;
3714 }
3715
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003716 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 Py_UNICODE ch = *s++;
3718 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003719#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003720 if (ch >= 0x10000) {
3721 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3722 ch = 0xD800 | ((ch-0x10000) >> 10);
3723 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003724#endif
Tim Peters772747b2001-08-09 22:21:55 +00003725 STORECHAR(ch);
3726 if (ch2)
3727 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003728 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003729
3730 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003731 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003732#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733}
3734
3735PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3736{
3737 if (!PyUnicode_Check(unicode)) {
3738 PyErr_BadArgument();
3739 return NULL;
3740 }
3741 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003742 PyUnicode_GET_SIZE(unicode),
3743 NULL,
3744 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745}
3746
/* --- Unicode Escape Codec ----------------------------------------------- */
3748
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on demand by the unicode-escape
   decoder when it needs character-name lookup -- the assignment is not
   visible in this part of the file, confirm there. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003750
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003752 Py_ssize_t size,
3753 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003755 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003756 Py_ssize_t startinpos;
3757 Py_ssize_t endinpos;
3758 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003759 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003760 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003763 char* message;
3764 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 PyObject *errorHandler = NULL;
3766 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003767
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768 /* Escaped strings will always be longer than the resulting
3769 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 length after conversion to the true value.
3771 (but if the error callback returns a long replacement string
3772 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 v = _PyUnicode_New(size);
3774 if (v == NULL)
3775 goto onError;
3776 if (size == 0)
3777 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003778
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003779 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003781
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 while (s < end) {
3783 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003784 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003785 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786
3787 /* Non-escape characters are interpreted as Unicode ordinals */
3788 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003789 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 continue;
3791 }
3792
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794 /* \ - Escapes */
3795 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003796 c = *s++;
3797 if (s > end)
3798 c = '\0'; /* Invalid after \ */
3799 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800
Benjamin Peterson29060642009-01-31 22:14:21 +00003801 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 case '\n': break;
3803 case '\\': *p++ = '\\'; break;
3804 case '\'': *p++ = '\''; break;
3805 case '\"': *p++ = '\"'; break;
3806 case 'b': *p++ = '\b'; break;
3807 case 'f': *p++ = '\014'; break; /* FF */
3808 case 't': *p++ = '\t'; break;
3809 case 'n': *p++ = '\n'; break;
3810 case 'r': *p++ = '\r'; break;
3811 case 'v': *p++ = '\013'; break; /* VT */
3812 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3813
Benjamin Peterson29060642009-01-31 22:14:21 +00003814 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 case '0': case '1': case '2': case '3':
3816 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003817 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003818 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003819 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003820 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003821 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003823 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 break;
3825
Benjamin Peterson29060642009-01-31 22:14:21 +00003826 /* hex escapes */
3827 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003829 digits = 2;
3830 message = "truncated \\xXX escape";
3831 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832
Benjamin Peterson29060642009-01-31 22:14:21 +00003833 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003835 digits = 4;
3836 message = "truncated \\uXXXX escape";
3837 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838
Benjamin Peterson29060642009-01-31 22:14:21 +00003839 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003840 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003841 digits = 8;
3842 message = "truncated \\UXXXXXXXX escape";
3843 hexescape:
3844 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003845 outpos = p-PyUnicode_AS_UNICODE(v);
3846 if (s+digits>end) {
3847 endinpos = size;
3848 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003849 errors, &errorHandler,
3850 "unicodeescape", "end of string in escape sequence",
3851 &starts, &end, &startinpos, &endinpos, &exc, &s,
3852 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003853 goto onError;
3854 goto nextByte;
3855 }
3856 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003857 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003858 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003859 endinpos = (s+i+1)-starts;
3860 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003861 errors, &errorHandler,
3862 "unicodeescape", message,
3863 &starts, &end, &startinpos, &endinpos, &exc, &s,
3864 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003865 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003866 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003867 }
3868 chr = (chr<<4) & ~0xF;
3869 if (c >= '0' && c <= '9')
3870 chr += c - '0';
3871 else if (c >= 'a' && c <= 'f')
3872 chr += 10 + c - 'a';
3873 else
3874 chr += 10 + c - 'A';
3875 }
3876 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003877 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003878 /* _decoding_error will have already written into the
3879 target buffer. */
3880 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003881 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003882 /* when we get here, chr is a 32-bit unicode character */
3883 if (chr <= 0xffff)
3884 /* UCS-2 character */
3885 *p++ = (Py_UNICODE) chr;
3886 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003887 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003888 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003889#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003890 *p++ = chr;
3891#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003892 chr -= 0x10000L;
3893 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003894 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003895#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003896 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 endinpos = s-starts;
3898 outpos = p-PyUnicode_AS_UNICODE(v);
3899 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003900 errors, &errorHandler,
3901 "unicodeescape", "illegal Unicode character",
3902 &starts, &end, &startinpos, &endinpos, &exc, &s,
3903 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003904 goto onError;
3905 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003906 break;
3907
Benjamin Peterson29060642009-01-31 22:14:21 +00003908 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003909 case 'N':
3910 message = "malformed \\N character escape";
3911 if (ucnhash_CAPI == NULL) {
3912 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003913 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003914 if (ucnhash_CAPI == NULL)
3915 goto ucnhashError;
3916 }
3917 if (*s == '{') {
3918 const char *start = s+1;
3919 /* look for the closing brace */
3920 while (*s != '}' && s < end)
3921 s++;
3922 if (s > start && s < end && *s == '}') {
3923 /* found a name. look it up in the unicode database */
3924 message = "unknown Unicode character name";
3925 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02003926 if (s - start - 1 <= INT_MAX &&
3927 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003928 goto store;
3929 }
3930 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003931 endinpos = s-starts;
3932 outpos = p-PyUnicode_AS_UNICODE(v);
3933 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003934 errors, &errorHandler,
3935 "unicodeescape", message,
3936 &starts, &end, &startinpos, &endinpos, &exc, &s,
3937 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003938 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003939 break;
3940
3941 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003942 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943 message = "\\ at end of string";
3944 s--;
3945 endinpos = s-starts;
3946 outpos = p-PyUnicode_AS_UNICODE(v);
3947 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003948 errors, &errorHandler,
3949 "unicodeescape", message,
3950 &starts, &end, &startinpos, &endinpos, &exc, &s,
3951 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003952 goto onError;
3953 }
3954 else {
3955 *p++ = '\\';
3956 *p++ = (unsigned char)s[-1];
3957 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003958 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003959 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003961 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003963 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003965 Py_XDECREF(errorHandler);
3966 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003968
Benjamin Peterson29060642009-01-31 22:14:21 +00003969 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003970 PyErr_SetString(
3971 PyExc_UnicodeError,
3972 "\\N escapes not supported (can't load unicodedata module)"
3973 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003974 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003975 Py_XDECREF(errorHandler);
3976 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003977 return NULL;
3978
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 Py_XDECREF(errorHandler);
3982 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 return NULL;
3984}
3985
/* Return a Unicode-Escape string version of the Unicode object.

   Note: the encoder defined below, PyUnicode_EncodeUnicodeEscape(), takes
   no `quotes` argument and does not enclose its output in u"" or u''
   quotes.

*/
3992
Thomas Wouters477c8d52006-05-27 19:21:47 +00003993Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 Py_ssize_t size,
3995 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003996{
3997 /* like wcschr, but doesn't stop at NULL characters */
3998
3999 while (size-- > 0) {
4000 if (*s == ch)
4001 return s;
4002 s++;
4003 }
4004
4005 return NULL;
4006}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004007
Walter Dörwald79e913e2007-05-12 11:08:06 +00004008static const char *hexdigits = "0123456789abcdef";
4009
4010PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004011 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004013 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004016#ifdef Py_UNICODE_WIDE
4017 const Py_ssize_t expandsize = 10;
4018#else
4019 const Py_ssize_t expandsize = 6;
4020#endif
4021
Thomas Wouters89f507f2006-12-13 04:49:30 +00004022 /* XXX(nnorwitz): rather than over-allocating, it would be
4023 better to choose a different scheme. Perhaps scan the
4024 first N-chars of the string and allocate based on that size.
4025 */
4026 /* Initial allocation is based on the longest-possible unichr
4027 escape.
4028
4029 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4030 unichr, so in this case it's the longest unichr escape. In
4031 narrow (UTF-16) builds this is five chars per source unichr
4032 since there are two unichrs in the surrogate pair, so in narrow
4033 (UTF-16) builds it's not the longest unichr escape.
4034
4035 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4036 so in the narrow (UTF-16) build case it's the longest unichr
4037 escape.
4038 */
4039
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004040 if (size == 0)
4041 return PyBytes_FromStringAndSize(NULL, 0);
4042
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004043 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004044 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004045
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004046 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004047 2
4048 + expandsize*size
4049 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050 if (repr == NULL)
4051 return NULL;
4052
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004053 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 while (size-- > 0) {
4056 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004057
Walter Dörwald79e913e2007-05-12 11:08:06 +00004058 /* Escape backslashes */
4059 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004060 *p++ = '\\';
4061 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004062 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004063 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004064
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004065#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004066 /* Map 21-bit characters to '\U00xxxxxx' */
4067 else if (ch >= 0x10000) {
4068 *p++ = '\\';
4069 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004070 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4071 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4072 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4073 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4074 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4075 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4076 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4077 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004078 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004079 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004080#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4082 else if (ch >= 0xD800 && ch < 0xDC00) {
4083 Py_UNICODE ch2;
4084 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004085
Benjamin Peterson29060642009-01-31 22:14:21 +00004086 ch2 = *s++;
4087 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004088 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004089 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4090 *p++ = '\\';
4091 *p++ = 'U';
4092 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4093 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4094 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4095 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4096 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4097 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4098 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4099 *p++ = hexdigits[ucs & 0x0000000F];
4100 continue;
4101 }
4102 /* Fall through: isolated surrogates are copied as-is */
4103 s--;
4104 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004105 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004106#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004107
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004109 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110 *p++ = '\\';
4111 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004112 *p++ = hexdigits[(ch >> 12) & 0x000F];
4113 *p++ = hexdigits[(ch >> 8) & 0x000F];
4114 *p++ = hexdigits[(ch >> 4) & 0x000F];
4115 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004117
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004118 /* Map special whitespace to '\t', \n', '\r' */
4119 else if (ch == '\t') {
4120 *p++ = '\\';
4121 *p++ = 't';
4122 }
4123 else if (ch == '\n') {
4124 *p++ = '\\';
4125 *p++ = 'n';
4126 }
4127 else if (ch == '\r') {
4128 *p++ = '\\';
4129 *p++ = 'r';
4130 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004131
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004132 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004133 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004135 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004136 *p++ = hexdigits[(ch >> 4) & 0x000F];
4137 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004138 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004139
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 /* Copy everything else as-is */
4141 else
4142 *p++ = (char) ch;
4143 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004145 assert(p - PyBytes_AS_STRING(repr) > 0);
4146 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4147 return NULL;
4148 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149}
4150
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004151PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004152{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004153 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 if (!PyUnicode_Check(unicode)) {
4155 PyErr_BadArgument();
4156 return NULL;
4157 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004158 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4159 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004160 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161}
4162
4163/* --- Raw Unicode Escape Codec ------------------------------------------- */
4164
4165PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004166 Py_ssize_t size,
4167 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004169 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004170 Py_ssize_t startinpos;
4171 Py_ssize_t endinpos;
4172 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175 const char *end;
4176 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177 PyObject *errorHandler = NULL;
4178 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004179
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 /* Escaped strings will always be longer than the resulting
4181 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004182 length after conversion to the true value. (But decoding error
4183 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184 v = _PyUnicode_New(size);
4185 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004188 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190 end = s + size;
4191 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 unsigned char c;
4193 Py_UCS4 x;
4194 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004195 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 /* Non-escape characters are interpreted as Unicode ordinals */
4198 if (*s != '\\') {
4199 *p++ = (unsigned char)*s++;
4200 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004201 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 startinpos = s-starts;
4203
4204 /* \u-escapes are only interpreted iff the number of leading
4205 backslashes if odd */
4206 bs = s;
4207 for (;s < end;) {
4208 if (*s != '\\')
4209 break;
4210 *p++ = (unsigned char)*s++;
4211 }
4212 if (((s - bs) & 1) == 0 ||
4213 s >= end ||
4214 (*s != 'u' && *s != 'U')) {
4215 continue;
4216 }
4217 p--;
4218 count = *s=='u' ? 4 : 8;
4219 s++;
4220
4221 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4222 outpos = p-PyUnicode_AS_UNICODE(v);
4223 for (x = 0, i = 0; i < count; ++i, ++s) {
4224 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004225 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 endinpos = s-starts;
4227 if (unicode_decode_call_errorhandler(
4228 errors, &errorHandler,
4229 "rawunicodeescape", "truncated \\uXXXX",
4230 &starts, &end, &startinpos, &endinpos, &exc, &s,
4231 &v, &outpos, &p))
4232 goto onError;
4233 goto nextByte;
4234 }
4235 x = (x<<4) & ~0xF;
4236 if (c >= '0' && c <= '9')
4237 x += c - '0';
4238 else if (c >= 'a' && c <= 'f')
4239 x += 10 + c - 'a';
4240 else
4241 x += 10 + c - 'A';
4242 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004243 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 /* UCS-2 character */
4245 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004246 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 /* UCS-4 character. Either store directly, or as
4248 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004249#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004251#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004252 x -= 0x10000L;
4253 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4254 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004255#endif
4256 } else {
4257 endinpos = s-starts;
4258 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004259 if (unicode_decode_call_errorhandler(
4260 errors, &errorHandler,
4261 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 &starts, &end, &startinpos, &endinpos, &exc, &s,
4263 &v, &outpos, &p))
4264 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004265 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 nextByte:
4267 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004269 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004270 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 Py_XDECREF(errorHandler);
4272 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004274
Benjamin Peterson29060642009-01-31 22:14:21 +00004275 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 Py_XDECREF(errorHandler);
4278 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 return NULL;
4280}
4281
4282PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004285 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 char *p;
4287 char *q;
4288
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004289#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004290 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004291#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004292 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004293#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004294
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004295 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004297
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004298 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 if (repr == NULL)
4300 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004301 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004302 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004304 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 while (size-- > 0) {
4306 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004307#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004308 /* Map 32-bit characters to '\Uxxxxxxxx' */
4309 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004310 *p++ = '\\';
4311 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004312 *p++ = hexdigits[(ch >> 28) & 0xf];
4313 *p++ = hexdigits[(ch >> 24) & 0xf];
4314 *p++ = hexdigits[(ch >> 20) & 0xf];
4315 *p++ = hexdigits[(ch >> 16) & 0xf];
4316 *p++ = hexdigits[(ch >> 12) & 0xf];
4317 *p++ = hexdigits[(ch >> 8) & 0xf];
4318 *p++ = hexdigits[(ch >> 4) & 0xf];
4319 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004320 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004321 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004322#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004323 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4324 if (ch >= 0xD800 && ch < 0xDC00) {
4325 Py_UNICODE ch2;
4326 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004327
Benjamin Peterson29060642009-01-31 22:14:21 +00004328 ch2 = *s++;
4329 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004330 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004331 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4332 *p++ = '\\';
4333 *p++ = 'U';
4334 *p++ = hexdigits[(ucs >> 28) & 0xf];
4335 *p++ = hexdigits[(ucs >> 24) & 0xf];
4336 *p++ = hexdigits[(ucs >> 20) & 0xf];
4337 *p++ = hexdigits[(ucs >> 16) & 0xf];
4338 *p++ = hexdigits[(ucs >> 12) & 0xf];
4339 *p++ = hexdigits[(ucs >> 8) & 0xf];
4340 *p++ = hexdigits[(ucs >> 4) & 0xf];
4341 *p++ = hexdigits[ucs & 0xf];
4342 continue;
4343 }
4344 /* Fall through: isolated surrogates are copied as-is */
4345 s--;
4346 size++;
4347 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004348#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 /* Map 16-bit characters to '\uxxxx' */
4350 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 *p++ = '\\';
4352 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004353 *p++ = hexdigits[(ch >> 12) & 0xf];
4354 *p++ = hexdigits[(ch >> 8) & 0xf];
4355 *p++ = hexdigits[(ch >> 4) & 0xf];
4356 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004358 /* Copy everything else as-is */
4359 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360 *p++ = (char) ch;
4361 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004362 size = p - q;
4363
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004364 assert(size > 0);
4365 if (_PyBytes_Resize(&repr, size) < 0)
4366 return NULL;
4367 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368}
4369
4370PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4371{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004372 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004374 PyErr_BadArgument();
4375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004377 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4378 PyUnicode_GET_SIZE(unicode));
4379
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004380 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381}
4382
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004383/* --- Unicode Internal Codec ------------------------------------------- */
4384
4385PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 Py_ssize_t size,
4387 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004388{
4389 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004390 Py_ssize_t startinpos;
4391 Py_ssize_t endinpos;
4392 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004393 PyUnicodeObject *v;
4394 Py_UNICODE *p;
4395 const char *end;
4396 const char *reason;
4397 PyObject *errorHandler = NULL;
4398 PyObject *exc = NULL;
4399
Neal Norwitzd43069c2006-01-08 01:12:10 +00004400#ifdef Py_UNICODE_WIDE
4401 Py_UNICODE unimax = PyUnicode_GetMax();
4402#endif
4403
Thomas Wouters89f507f2006-12-13 04:49:30 +00004404 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004405 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4406 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004408 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004410 p = PyUnicode_AS_UNICODE(v);
4411 end = s + size;
4412
4413 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004414 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004415 /* We have to sanity check the raw data, otherwise doom looms for
4416 some malformed UCS-4 data. */
4417 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004418#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004419 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004420#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004421 end-s < Py_UNICODE_SIZE
4422 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004424 startinpos = s - starts;
4425 if (end-s < Py_UNICODE_SIZE) {
4426 endinpos = end-starts;
4427 reason = "truncated input";
4428 }
4429 else {
4430 endinpos = s - starts + Py_UNICODE_SIZE;
4431 reason = "illegal code point (> 0x10FFFF)";
4432 }
4433 outpos = p - PyUnicode_AS_UNICODE(v);
4434 if (unicode_decode_call_errorhandler(
4435 errors, &errorHandler,
4436 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004437 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004438 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004439 goto onError;
4440 }
4441 }
4442 else {
4443 p++;
4444 s += Py_UNICODE_SIZE;
4445 }
4446 }
4447
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004448 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004449 goto onError;
4450 Py_XDECREF(errorHandler);
4451 Py_XDECREF(exc);
4452 return (PyObject *)v;
4453
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004455 Py_XDECREF(v);
4456 Py_XDECREF(errorHandler);
4457 Py_XDECREF(exc);
4458 return NULL;
4459}
4460
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461/* --- Latin-1 Codec ------------------------------------------------------ */
4462
4463PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 Py_ssize_t size,
4465 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466{
4467 PyUnicodeObject *v;
4468 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004469 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004470
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004472 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 Py_UNICODE r = *(unsigned char*)s;
4474 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004475 }
4476
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 v = _PyUnicode_New(size);
4478 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004479 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004482 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004483 e = s + size;
4484 /* Unrolling the copy makes it much faster by reducing the looping
4485 overhead. This is similar to what many memcpy() implementations do. */
4486 unrolled_end = e - 4;
4487 while (s < unrolled_end) {
4488 p[0] = (unsigned char) s[0];
4489 p[1] = (unsigned char) s[1];
4490 p[2] = (unsigned char) s[2];
4491 p[3] = (unsigned char) s[3];
4492 s += 4;
4493 p += 4;
4494 }
4495 while (s < e)
4496 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004498
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 Py_XDECREF(v);
4501 return NULL;
4502}
4503
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004504/* create or adjust a UnicodeEncodeError */
4505static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 const char *encoding,
4507 const Py_UNICODE *unicode, Py_ssize_t size,
4508 Py_ssize_t startpos, Py_ssize_t endpos,
4509 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 *exceptionObject = PyUnicodeEncodeError_Create(
4513 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514 }
4515 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4517 goto onError;
4518 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4519 goto onError;
4520 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4521 goto onError;
4522 return;
4523 onError:
4524 Py_DECREF(*exceptionObject);
4525 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 }
4527}
4528
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004529/* raises a UnicodeEncodeError */
4530static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 const char *encoding,
4532 const Py_UNICODE *unicode, Py_ssize_t size,
4533 Py_ssize_t startpos, Py_ssize_t endpos,
4534 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535{
4536 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004537 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540}
4541
4542/* error handling callback helper:
4543 build arguments, call the callback and check the arguments,
4544 put the result into newpos and return the replacement string, which
4545 has to be freed by the caller */
4546static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004547 PyObject **errorHandler,
4548 const char *encoding, const char *reason,
4549 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4550 Py_ssize_t startpos, Py_ssize_t endpos,
4551 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004553 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554
4555 PyObject *restuple;
4556 PyObject *resunicode;
4557
4558 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004559 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004562 }
4563
4564 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004565 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004566 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568
4569 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004571 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004572 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004574 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004575 Py_DECREF(restuple);
4576 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004577 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004578 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004579 &resunicode, newpos)) {
4580 Py_DECREF(restuple);
4581 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004583 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4584 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4585 Py_DECREF(restuple);
4586 return NULL;
4587 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004590 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4592 Py_DECREF(restuple);
4593 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004594 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595 Py_INCREF(resunicode);
4596 Py_DECREF(restuple);
4597 return resunicode;
4598}
4599
4600static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 Py_ssize_t size,
4602 const char *errors,
4603 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604{
4605 /* output object */
4606 PyObject *res;
4607 /* pointers to the beginning and end+1 of input */
4608 const Py_UNICODE *startp = p;
4609 const Py_UNICODE *endp = p + size;
4610 /* pointer to the beginning of the unencodable characters */
4611 /* const Py_UNICODE *badp = NULL; */
4612 /* pointer into the output */
4613 char *str;
4614 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004615 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004616 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4617 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618 PyObject *errorHandler = NULL;
4619 PyObject *exc = NULL;
4620 /* the following variable is used for caching string comparisons
4621 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4622 int known_errorHandler = -1;
4623
4624 /* allocate enough for a simple encoding without
4625 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004626 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004627 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004628 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004629 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004630 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004631 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004632 ressize = size;
4633
4634 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 /* can we encode this? */
4638 if (c<limit) {
4639 /* no overflow check, because we know that the space is enough */
4640 *str++ = (char)c;
4641 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004642 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 else {
4644 Py_ssize_t unicodepos = p-startp;
4645 Py_ssize_t requiredsize;
4646 PyObject *repunicode;
4647 Py_ssize_t repsize;
4648 Py_ssize_t newpos;
4649 Py_ssize_t respos;
4650 Py_UNICODE *uni2;
4651 /* startpos for collecting unencodable chars */
4652 const Py_UNICODE *collstart = p;
4653 const Py_UNICODE *collend = p;
4654 /* find all unecodable characters */
4655 while ((collend < endp) && ((*collend)>=limit))
4656 ++collend;
4657 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4658 if (known_errorHandler==-1) {
4659 if ((errors==NULL) || (!strcmp(errors, "strict")))
4660 known_errorHandler = 1;
4661 else if (!strcmp(errors, "replace"))
4662 known_errorHandler = 2;
4663 else if (!strcmp(errors, "ignore"))
4664 known_errorHandler = 3;
4665 else if (!strcmp(errors, "xmlcharrefreplace"))
4666 known_errorHandler = 4;
4667 else
4668 known_errorHandler = 0;
4669 }
4670 switch (known_errorHandler) {
4671 case 1: /* strict */
4672 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4673 goto onError;
4674 case 2: /* replace */
4675 while (collstart++<collend)
4676 *str++ = '?'; /* fall through */
4677 case 3: /* ignore */
4678 p = collend;
4679 break;
4680 case 4: /* xmlcharrefreplace */
4681 respos = str - PyBytes_AS_STRING(res);
4682 /* determine replacement size (temporarily (mis)uses p) */
4683 for (p = collstart, repsize = 0; p < collend; ++p) {
4684 if (*p<10)
4685 repsize += 2+1+1;
4686 else if (*p<100)
4687 repsize += 2+2+1;
4688 else if (*p<1000)
4689 repsize += 2+3+1;
4690 else if (*p<10000)
4691 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004692#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004693 else
4694 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004695#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004696 else if (*p<100000)
4697 repsize += 2+5+1;
4698 else if (*p<1000000)
4699 repsize += 2+6+1;
4700 else
4701 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004702#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004703 }
4704 requiredsize = respos+repsize+(endp-collend);
4705 if (requiredsize > ressize) {
4706 if (requiredsize<2*ressize)
4707 requiredsize = 2*ressize;
4708 if (_PyBytes_Resize(&res, requiredsize))
4709 goto onError;
4710 str = PyBytes_AS_STRING(res) + respos;
4711 ressize = requiredsize;
4712 }
4713 /* generate replacement (temporarily (mis)uses p) */
4714 for (p = collstart; p < collend; ++p) {
4715 str += sprintf(str, "&#%d;", (int)*p);
4716 }
4717 p = collend;
4718 break;
4719 default:
4720 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4721 encoding, reason, startp, size, &exc,
4722 collstart-startp, collend-startp, &newpos);
4723 if (repunicode == NULL)
4724 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004725 if (PyBytes_Check(repunicode)) {
4726 /* Directly copy bytes result to output. */
4727 repsize = PyBytes_Size(repunicode);
4728 if (repsize > 1) {
4729 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004730 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004731 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4732 Py_DECREF(repunicode);
4733 goto onError;
4734 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004735 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004736 ressize += repsize-1;
4737 }
4738 memcpy(str, PyBytes_AsString(repunicode), repsize);
4739 str += repsize;
4740 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004741 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004742 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004743 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004744 /* need more space? (at least enough for what we
4745 have+the replacement+the rest of the string, so
4746 we won't have to check space for encodable characters) */
4747 respos = str - PyBytes_AS_STRING(res);
4748 repsize = PyUnicode_GET_SIZE(repunicode);
4749 requiredsize = respos+repsize+(endp-collend);
4750 if (requiredsize > ressize) {
4751 if (requiredsize<2*ressize)
4752 requiredsize = 2*ressize;
4753 if (_PyBytes_Resize(&res, requiredsize)) {
4754 Py_DECREF(repunicode);
4755 goto onError;
4756 }
4757 str = PyBytes_AS_STRING(res) + respos;
4758 ressize = requiredsize;
4759 }
4760 /* check if there is anything unencodable in the replacement
4761 and copy it to the output */
4762 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4763 c = *uni2;
4764 if (c >= limit) {
4765 raise_encode_exception(&exc, encoding, startp, size,
4766 unicodepos, unicodepos+1, reason);
4767 Py_DECREF(repunicode);
4768 goto onError;
4769 }
4770 *str = (char)c;
4771 }
4772 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004773 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004774 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004775 }
4776 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004777 /* Resize if we allocated to much */
4778 size = str - PyBytes_AS_STRING(res);
4779 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004780 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004781 if (_PyBytes_Resize(&res, size) < 0)
4782 goto onError;
4783 }
4784
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004785 Py_XDECREF(errorHandler);
4786 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004787 return res;
4788
4789 onError:
4790 Py_XDECREF(res);
4791 Py_XDECREF(errorHandler);
4792 Py_XDECREF(exc);
4793 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004794}
4795
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004797 Py_ssize_t size,
4798 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004800 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801}
4802
4803PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4804{
4805 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004806 PyErr_BadArgument();
4807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808 }
4809 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004810 PyUnicode_GET_SIZE(unicode),
4811 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812}
4813
4814/* --- 7-bit ASCII Codec -------------------------------------------------- */
4815
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004817 Py_ssize_t size,
4818 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004820 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821 PyUnicodeObject *v;
4822 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004823 Py_ssize_t startinpos;
4824 Py_ssize_t endinpos;
4825 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826 const char *e;
4827 PyObject *errorHandler = NULL;
4828 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004829
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004831 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004832 Py_UNICODE r = *(unsigned char*)s;
4833 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004834 }
Tim Petersced69f82003-09-16 20:30:58 +00004835
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836 v = _PyUnicode_New(size);
4837 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004842 e = s + size;
4843 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 register unsigned char c = (unsigned char)*s;
4845 if (c < 128) {
4846 *p++ = c;
4847 ++s;
4848 }
4849 else {
4850 startinpos = s-starts;
4851 endinpos = startinpos + 1;
4852 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4853 if (unicode_decode_call_errorhandler(
4854 errors, &errorHandler,
4855 "ascii", "ordinal not in range(128)",
4856 &starts, &e, &startinpos, &endinpos, &exc, &s,
4857 &v, &outpos, &p))
4858 goto onError;
4859 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004861 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4863 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004864 Py_XDECREF(errorHandler);
4865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004867
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 Py_XDECREF(errorHandler);
4871 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 return NULL;
4873}
4874
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 Py_ssize_t size,
4877 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004879 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880}
4881
4882PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4883{
4884 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 PyErr_BadArgument();
4886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887 }
4888 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004889 PyUnicode_GET_SIZE(unicode),
4890 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891}
4892
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004893#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004894
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004895/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004896
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004897#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004898#define NEED_RETRY
4899#endif
4900
4901/* XXX This code is limited to "true" double-byte encodings, as
4902 a) it assumes an incomplete character consists of a single byte, and
4903 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004904 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004905
/* Return non-zero if the byte at s[offset] acts as a DBCS lead byte.

   A byte that IsDBCSLeadByte() accepts only counts when it is not itself
   the trail byte of the preceding character, which is determined by
   stepping back one character with CharPrev(). */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
4916
4917/*
4918 * Decode MBCS string into unicode object. If 'final' is set, converts
4919 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4920 */
4921static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 const char *s, /* MBCS string */
4923 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004924 int final,
4925 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004926{
4927 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004928 Py_ssize_t n;
4929 DWORD usize;
4930 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004931
4932 assert(size >= 0);
4933
Victor Stinner554f3f02010-06-16 23:33:54 +00004934 /* check and handle 'errors' arg */
4935 if (errors==NULL || strcmp(errors, "strict")==0)
4936 flags = MB_ERR_INVALID_CHARS;
4937 else if (strcmp(errors, "ignore")==0)
4938 flags = 0;
4939 else {
4940 PyErr_Format(PyExc_ValueError,
4941 "mbcs encoding does not support errors='%s'",
4942 errors);
4943 return -1;
4944 }
4945
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004946 /* Skip trailing lead-byte unless 'final' is set */
4947 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004949
4950 /* First get the size of the result */
4951 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004952 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4953 if (usize==0)
4954 goto mbcs_decode_error;
4955 } else
4956 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004957
4958 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004959 /* Create unicode object */
4960 *v = _PyUnicode_New(usize);
4961 if (*v == NULL)
4962 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004963 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004964 }
4965 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 /* Extend unicode object */
4967 n = PyUnicode_GET_SIZE(*v);
4968 if (_PyUnicode_Resize(v, n + usize) < 0)
4969 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004970 }
4971
4972 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004973 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004975 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4976 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004977 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004978 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004979 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004980
4981mbcs_decode_error:
4982 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4983 we raise a UnicodeDecodeError - else it is a 'generic'
4984 windows error
4985 */
4986 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4987 /* Ideally, we should get reason from FormatMessage - this
4988 is the Windows 2000 English version of the message
4989 */
4990 PyObject *exc = NULL;
4991 const char *reason = "No mapping for the Unicode character exists "
4992 "in the target multi-byte code page.";
4993 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4994 if (exc != NULL) {
4995 PyCodec_StrictErrors(exc);
4996 Py_DECREF(exc);
4997 }
4998 } else {
4999 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5000 }
5001 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005002}
5003
5004PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 Py_ssize_t size,
5006 const char *errors,
5007 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005008{
5009 PyUnicodeObject *v = NULL;
5010 int done;
5011
5012 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005014
5015#ifdef NEED_RETRY
5016 retry:
5017 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005018 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005019 else
5020#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005021 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005022
5023 if (done < 0) {
5024 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005026 }
5027
5028 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005030
5031#ifdef NEED_RETRY
5032 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 s += done;
5034 size -= done;
5035 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005036 }
5037#endif
5038
5039 return (PyObject *)v;
5040}
5041
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005042PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 Py_ssize_t size,
5044 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005045{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005046 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5047}
5048
5049/*
5050 * Convert unicode into string object (MBCS).
5051 * Returns 0 if succeed, -1 otherwise.
5052 */
5053static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005055 int size, /* size of unicode */
5056 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005057{
Victor Stinner554f3f02010-06-16 23:33:54 +00005058 BOOL usedDefaultChar = FALSE;
5059 BOOL *pusedDefaultChar;
5060 int mbcssize;
5061 Py_ssize_t n;
5062 PyObject *exc = NULL;
5063 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005064
5065 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005066
Victor Stinner554f3f02010-06-16 23:33:54 +00005067 /* check and handle 'errors' arg */
5068 if (errors==NULL || strcmp(errors, "strict")==0) {
5069 flags = WC_NO_BEST_FIT_CHARS;
5070 pusedDefaultChar = &usedDefaultChar;
5071 } else if (strcmp(errors, "replace")==0) {
5072 flags = 0;
5073 pusedDefaultChar = NULL;
5074 } else {
5075 PyErr_Format(PyExc_ValueError,
5076 "mbcs encoding does not support errors='%s'",
5077 errors);
5078 return -1;
5079 }
5080
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005081 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005082 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005083 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5084 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 if (mbcssize == 0) {
5086 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5087 return -1;
5088 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005089 /* If we used a default char, then we failed! */
5090 if (pusedDefaultChar && *pusedDefaultChar)
5091 goto mbcs_encode_error;
5092 } else {
5093 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005094 }
5095
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005096 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 /* Create string object */
5098 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5099 if (*repr == NULL)
5100 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005101 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005102 }
5103 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 /* Extend string object */
5105 n = PyBytes_Size(*repr);
5106 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5107 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005108 }
5109
5110 /* Do the conversion */
5111 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005113 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5114 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005115 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5116 return -1;
5117 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005118 if (pusedDefaultChar && *pusedDefaultChar)
5119 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005120 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005121 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005122
5123mbcs_encode_error:
5124 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5125 Py_XDECREF(exc);
5126 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005127}
5128
5129PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 Py_ssize_t size,
5131 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005132{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005133 PyObject *repr = NULL;
5134 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005135
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005136#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005138 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005139 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005140 else
5141#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005142 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005143
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005144 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 Py_XDECREF(repr);
5146 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005147 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005148
5149#ifdef NEED_RETRY
5150 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 p += INT_MAX;
5152 size -= INT_MAX;
5153 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005154 }
5155#endif
5156
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005157 return repr;
5158}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005159
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005160PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5161{
5162 if (!PyUnicode_Check(unicode)) {
5163 PyErr_BadArgument();
5164 return NULL;
5165 }
5166 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 PyUnicode_GET_SIZE(unicode),
5168 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005169}
5170
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005171#undef NEED_RETRY
5172
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005173#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005174
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175/* --- Character Mapping Codec -------------------------------------------- */
5176
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 Py_ssize_t size,
5179 PyObject *mapping,
5180 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005182 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005183 Py_ssize_t startinpos;
5184 Py_ssize_t endinpos;
5185 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005186 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 PyUnicodeObject *v;
5188 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005189 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005190 PyObject *errorHandler = NULL;
5191 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005192 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005193 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005194
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 /* Default to Latin-1 */
5196 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198
5199 v = _PyUnicode_New(size);
5200 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005206 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 mapstring = PyUnicode_AS_UNICODE(mapping);
5208 maplen = PyUnicode_GET_SIZE(mapping);
5209 while (s < e) {
5210 unsigned char ch = *s;
5211 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
Benjamin Peterson29060642009-01-31 22:14:21 +00005213 if (ch < maplen)
5214 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 if (x == 0xfffe) {
5217 /* undefined mapping */
5218 outpos = p-PyUnicode_AS_UNICODE(v);
5219 startinpos = s-starts;
5220 endinpos = startinpos+1;
5221 if (unicode_decode_call_errorhandler(
5222 errors, &errorHandler,
5223 "charmap", "character maps to <undefined>",
5224 &starts, &e, &startinpos, &endinpos, &exc, &s,
5225 &v, &outpos, &p)) {
5226 goto onError;
5227 }
5228 continue;
5229 }
5230 *p++ = x;
5231 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005232 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005233 }
5234 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 while (s < e) {
5236 unsigned char ch = *s;
5237 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5240 w = PyLong_FromLong((long)ch);
5241 if (w == NULL)
5242 goto onError;
5243 x = PyObject_GetItem(mapping, w);
5244 Py_DECREF(w);
5245 if (x == NULL) {
5246 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5247 /* No mapping found means: mapping is undefined. */
5248 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005249 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 } else
5251 goto onError;
5252 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005253
Benjamin Peterson29060642009-01-31 22:14:21 +00005254 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005255 if (x == Py_None)
5256 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 if (PyLong_Check(x)) {
5258 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005259 if (value == 0xFFFE)
5260 goto Undefined;
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005261 if (value < 0 || value > 0x10FFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 PyErr_SetString(PyExc_TypeError,
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005263 "character mapping must be in range(0x110000)");
Benjamin Peterson29060642009-01-31 22:14:21 +00005264 Py_DECREF(x);
5265 goto onError;
5266 }
Antoine Pitrou6f80f5d2012-09-23 19:55:21 +02005267
5268#ifndef Py_UNICODE_WIDE
5269 if (value > 0xFFFF) {
5270 /* see the code for 1-n mapping below */
5271 if (extrachars < 2) {
5272 /* resize first */
5273 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5274 Py_ssize_t needed = 10 - extrachars;
5275 extrachars += needed;
5276 /* XXX overflow detection missing */
5277 if (_PyUnicode_Resize(&v,
5278 PyUnicode_GET_SIZE(v) + needed) < 0) {
5279 Py_DECREF(x);
5280 goto onError;
5281 }
5282 p = PyUnicode_AS_UNICODE(v) + oldpos;
5283 }
5284 value -= 0x10000;
5285 *p++ = 0xD800 | (value >> 10);
5286 *p++ = 0xDC00 | (value & 0x3FF);
5287 extrachars -= 2;
5288 }
5289 else
5290#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 *p++ = (Py_UNICODE)value;
5292 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 else if (PyUnicode_Check(x)) {
5294 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005295
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005296 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 /* 1-1 mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005298 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
5299 if (value == 0xFFFE)
5300 goto Undefined;
5301 *p++ = value;
5302 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 else if (targetsize > 1) {
5304 /* 1-n mapping */
5305 if (targetsize > extrachars) {
5306 /* resize first */
5307 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5308 Py_ssize_t needed = (targetsize - extrachars) + \
5309 (targetsize << 2);
5310 extrachars += needed;
5311 /* XXX overflow detection missing */
5312 if (_PyUnicode_Resize(&v,
5313 PyUnicode_GET_SIZE(v) + needed) < 0) {
5314 Py_DECREF(x);
5315 goto onError;
5316 }
5317 p = PyUnicode_AS_UNICODE(v) + oldpos;
5318 }
5319 Py_UNICODE_COPY(p,
5320 PyUnicode_AS_UNICODE(x),
5321 targetsize);
5322 p += targetsize;
5323 extrachars -= targetsize;
5324 }
5325 /* 1-0 mapping: skip the character */
5326 }
5327 else {
5328 /* wrong return value */
5329 PyErr_SetString(PyExc_TypeError,
5330 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005331 Py_DECREF(x);
5332 goto onError;
5333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 Py_DECREF(x);
5335 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02005336 continue;
5337Undefined:
5338 /* undefined mapping */
5339 Py_XDECREF(x);
5340 outpos = p-PyUnicode_AS_UNICODE(v);
5341 startinpos = s-starts;
5342 endinpos = startinpos+1;
5343 if (unicode_decode_call_errorhandler(
5344 errors, &errorHandler,
5345 "charmap", "character maps to <undefined>",
5346 &starts, &e, &startinpos, &endinpos, &exc, &s,
5347 &v, &outpos, &p)) {
5348 goto onError;
5349 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005350 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 }
5352 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5354 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005355 Py_XDECREF(errorHandler);
5356 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005358
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005360 Py_XDECREF(errorHandler);
5361 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 Py_XDECREF(v);
5363 return NULL;
5364}
5365
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005366/* Charmap encoding: the lookup table */
5367
5368struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 PyObject_HEAD
5370 unsigned char level1[32];
5371 int count2, count3;
5372 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005373};
5374
5375static PyObject*
5376encoding_map_size(PyObject *obj, PyObject* args)
5377{
5378 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005379 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005381}
5382
5383static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005384 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 PyDoc_STR("Return the size (in bytes) of this object") },
5386 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005387};
5388
5389static void
5390encoding_map_dealloc(PyObject* o)
5391{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005392 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005393}
5394
5395static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005396 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 "EncodingMap", /*tp_name*/
5398 sizeof(struct encoding_map), /*tp_basicsize*/
5399 0, /*tp_itemsize*/
5400 /* methods */
5401 encoding_map_dealloc, /*tp_dealloc*/
5402 0, /*tp_print*/
5403 0, /*tp_getattr*/
5404 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005405 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 0, /*tp_repr*/
5407 0, /*tp_as_number*/
5408 0, /*tp_as_sequence*/
5409 0, /*tp_as_mapping*/
5410 0, /*tp_hash*/
5411 0, /*tp_call*/
5412 0, /*tp_str*/
5413 0, /*tp_getattro*/
5414 0, /*tp_setattro*/
5415 0, /*tp_as_buffer*/
5416 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5417 0, /*tp_doc*/
5418 0, /*tp_traverse*/
5419 0, /*tp_clear*/
5420 0, /*tp_richcompare*/
5421 0, /*tp_weaklistoffset*/
5422 0, /*tp_iter*/
5423 0, /*tp_iternext*/
5424 encoding_map_methods, /*tp_methods*/
5425 0, /*tp_members*/
5426 0, /*tp_getset*/
5427 0, /*tp_base*/
5428 0, /*tp_dict*/
5429 0, /*tp_descr_get*/
5430 0, /*tp_descr_set*/
5431 0, /*tp_dictoffset*/
5432 0, /*tp_init*/
5433 0, /*tp_alloc*/
5434 0, /*tp_new*/
5435 0, /*tp_free*/
5436 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005437};
5438
5439PyObject*
5440PyUnicode_BuildEncodingMap(PyObject* string)
5441{
5442 Py_UNICODE *decode;
5443 PyObject *result;
5444 struct encoding_map *mresult;
5445 int i;
5446 int need_dict = 0;
5447 unsigned char level1[32];
5448 unsigned char level2[512];
5449 unsigned char *mlevel1, *mlevel2, *mlevel3;
5450 int count2 = 0, count3 = 0;
5451
5452 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5453 PyErr_BadArgument();
5454 return NULL;
5455 }
5456 decode = PyUnicode_AS_UNICODE(string);
5457 memset(level1, 0xFF, sizeof level1);
5458 memset(level2, 0xFF, sizeof level2);
5459
5460 /* If there isn't a one-to-one mapping of NULL to \0,
5461 or if there are non-BMP characters, we need to use
5462 a mapping dictionary. */
5463 if (decode[0] != 0)
5464 need_dict = 1;
5465 for (i = 1; i < 256; i++) {
5466 int l1, l2;
5467 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005468#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005469 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005470#endif
5471 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005472 need_dict = 1;
5473 break;
5474 }
5475 if (decode[i] == 0xFFFE)
5476 /* unmapped character */
5477 continue;
5478 l1 = decode[i] >> 11;
5479 l2 = decode[i] >> 7;
5480 if (level1[l1] == 0xFF)
5481 level1[l1] = count2++;
5482 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005483 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005484 }
5485
5486 if (count2 >= 0xFF || count3 >= 0xFF)
5487 need_dict = 1;
5488
5489 if (need_dict) {
5490 PyObject *result = PyDict_New();
5491 PyObject *key, *value;
5492 if (!result)
5493 return NULL;
5494 for (i = 0; i < 256; i++) {
5495 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005496 key = PyLong_FromLong(decode[i]);
5497 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005498 if (!key || !value)
5499 goto failed1;
5500 if (PyDict_SetItem(result, key, value) == -1)
5501 goto failed1;
5502 Py_DECREF(key);
5503 Py_DECREF(value);
5504 }
5505 return result;
5506 failed1:
5507 Py_XDECREF(key);
5508 Py_XDECREF(value);
5509 Py_DECREF(result);
5510 return NULL;
5511 }
5512
5513 /* Create a three-level trie */
5514 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5515 16*count2 + 128*count3 - 1);
5516 if (!result)
5517 return PyErr_NoMemory();
5518 PyObject_Init(result, &EncodingMapType);
5519 mresult = (struct encoding_map*)result;
5520 mresult->count2 = count2;
5521 mresult->count3 = count3;
5522 mlevel1 = mresult->level1;
5523 mlevel2 = mresult->level23;
5524 mlevel3 = mresult->level23 + 16*count2;
5525 memcpy(mlevel1, level1, 32);
5526 memset(mlevel2, 0xFF, 16*count2);
5527 memset(mlevel3, 0, 128*count3);
5528 count3 = 0;
5529 for (i = 1; i < 256; i++) {
5530 int o1, o2, o3, i2, i3;
5531 if (decode[i] == 0xFFFE)
5532 /* unmapped character */
5533 continue;
5534 o1 = decode[i]>>11;
5535 o2 = (decode[i]>>7) & 0xF;
5536 i2 = 16*mlevel1[o1] + o2;
5537 if (mlevel2[i2] == 0xFF)
5538 mlevel2[i2] = count3++;
5539 o3 = decode[i] & 0x7F;
5540 i3 = 128*mlevel2[i2] + o3;
5541 mlevel3[i3] = i;
5542 }
5543 return result;
5544}
5545
5546static int
5547encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5548{
5549 struct encoding_map *map = (struct encoding_map*)mapping;
5550 int l1 = c>>11;
5551 int l2 = (c>>7) & 0xF;
5552 int l3 = c & 0x7F;
5553 int i;
5554
5555#ifdef Py_UNICODE_WIDE
5556 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005558 }
5559#endif
5560 if (c == 0)
5561 return 0;
5562 /* level 1*/
5563 i = map->level1[l1];
5564 if (i == 0xFF) {
5565 return -1;
5566 }
5567 /* level 2*/
5568 i = map->level23[16*i+l2];
5569 if (i == 0xFF) {
5570 return -1;
5571 }
5572 /* level 3 */
5573 i = map->level23[16*map->count2 + 128*i + l3];
5574 if (i == 0) {
5575 return -1;
5576 }
5577 return i;
5578}
5579
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580/* Lookup the character ch in the mapping. If the character
5581 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005582 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005583static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584{
Christian Heimes217cfd12007-12-02 14:31:20 +00005585 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005586 PyObject *x;
5587
5588 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005590 x = PyObject_GetItem(mapping, w);
5591 Py_DECREF(w);
5592 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005593 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5594 /* No mapping found means: mapping is undefined. */
5595 PyErr_Clear();
5596 x = Py_None;
5597 Py_INCREF(x);
5598 return x;
5599 } else
5600 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005602 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005604 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 long value = PyLong_AS_LONG(x);
5606 if (value < 0 || value > 255) {
5607 PyErr_SetString(PyExc_TypeError,
5608 "character mapping must be in range(256)");
5609 Py_DECREF(x);
5610 return NULL;
5611 }
5612 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005614 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 /* wrong return value */
5618 PyErr_Format(PyExc_TypeError,
5619 "character mapping must return integer, bytes or None, not %.400s",
5620 x->ob_type->tp_name);
5621 Py_DECREF(x);
5622 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 }
5624}
5625
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005626static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005627charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005628{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005629 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5630 /* exponentially overallocate to minimize reallocations */
5631 if (requiredsize < 2*outsize)
5632 requiredsize = 2*outsize;
5633 if (_PyBytes_Resize(outobj, requiredsize))
5634 return -1;
5635 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005636}
5637
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005642 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 space is available. Return a new reference to the object that
5644 was put in the output buffer, or Py_None, if the mapping was undefined
5645 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005646 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005648charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005651 PyObject *rep;
5652 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005653 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654
Christian Heimes90aa7642007-12-19 02:45:37 +00005655 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005656 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005658 if (res == -1)
5659 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 if (outsize<requiredsize)
5661 if (charmapencode_resize(outobj, outpos, requiredsize))
5662 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005663 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 outstart[(*outpos)++] = (char)res;
5665 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005666 }
5667
5668 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005669 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005670 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005671 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 Py_DECREF(rep);
5673 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005674 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 if (PyLong_Check(rep)) {
5676 Py_ssize_t requiredsize = *outpos+1;
5677 if (outsize<requiredsize)
5678 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5679 Py_DECREF(rep);
5680 return enc_EXCEPTION;
5681 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005682 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005684 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005685 else {
5686 const char *repchars = PyBytes_AS_STRING(rep);
5687 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5688 Py_ssize_t requiredsize = *outpos+repsize;
5689 if (outsize<requiredsize)
5690 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5691 Py_DECREF(rep);
5692 return enc_EXCEPTION;
5693 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005694 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 memcpy(outstart + *outpos, repchars, repsize);
5696 *outpos += repsize;
5697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005699 Py_DECREF(rep);
5700 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701}
5702
5703/* handle an error in PyUnicode_EncodeCharmap
5704 Return 0 on success, -1 on error */
5705static
5706int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005707 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005709 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005710 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711{
5712 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005713 Py_ssize_t repsize;
5714 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 Py_UNICODE *uni2;
5716 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005717 Py_ssize_t collstartpos = *inpos;
5718 Py_ssize_t collendpos = *inpos+1;
5719 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 char *encoding = "charmap";
5721 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005722 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005723
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 /* find all unencodable characters */
5725 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005726 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005727 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 int res = encoding_map_lookup(p[collendpos], mapping);
5729 if (res != -1)
5730 break;
5731 ++collendpos;
5732 continue;
5733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005734
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 rep = charmapencode_lookup(p[collendpos], mapping);
5736 if (rep==NULL)
5737 return -1;
5738 else if (rep!=Py_None) {
5739 Py_DECREF(rep);
5740 break;
5741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005742 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005744 }
5745 /* cache callback name lookup
5746 * (if not done yet, i.e. it's the first error) */
5747 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 if ((errors==NULL) || (!strcmp(errors, "strict")))
5749 *known_errorHandler = 1;
5750 else if (!strcmp(errors, "replace"))
5751 *known_errorHandler = 2;
5752 else if (!strcmp(errors, "ignore"))
5753 *known_errorHandler = 3;
5754 else if (!strcmp(errors, "xmlcharrefreplace"))
5755 *known_errorHandler = 4;
5756 else
5757 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005758 }
5759 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005760 case 1: /* strict */
5761 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5762 return -1;
5763 case 2: /* replace */
5764 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 x = charmapencode_output('?', mapping, res, respos);
5766 if (x==enc_EXCEPTION) {
5767 return -1;
5768 }
5769 else if (x==enc_FAILED) {
5770 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5771 return -1;
5772 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005773 }
5774 /* fall through */
5775 case 3: /* ignore */
5776 *inpos = collendpos;
5777 break;
5778 case 4: /* xmlcharrefreplace */
5779 /* generate replacement (temporarily (mis)uses p) */
5780 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 char buffer[2+29+1+1];
5782 char *cp;
5783 sprintf(buffer, "&#%d;", (int)p[collpos]);
5784 for (cp = buffer; *cp; ++cp) {
5785 x = charmapencode_output(*cp, mapping, res, respos);
5786 if (x==enc_EXCEPTION)
5787 return -1;
5788 else if (x==enc_FAILED) {
5789 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5790 return -1;
5791 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005792 }
5793 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005794 *inpos = collendpos;
5795 break;
5796 default:
5797 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 encoding, reason, p, size, exceptionObject,
5799 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005800 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005802 if (PyBytes_Check(repunicode)) {
5803 /* Directly copy bytes result to output. */
5804 Py_ssize_t outsize = PyBytes_Size(*res);
5805 Py_ssize_t requiredsize;
5806 repsize = PyBytes_Size(repunicode);
5807 requiredsize = *respos + repsize;
5808 if (requiredsize > outsize)
5809 /* Make room for all additional bytes. */
5810 if (charmapencode_resize(res, respos, requiredsize)) {
5811 Py_DECREF(repunicode);
5812 return -1;
5813 }
5814 memcpy(PyBytes_AsString(*res) + *respos,
5815 PyBytes_AsString(repunicode), repsize);
5816 *respos += repsize;
5817 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005818 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005819 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005820 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005821 /* generate replacement */
5822 repsize = PyUnicode_GET_SIZE(repunicode);
5823 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 x = charmapencode_output(*uni2, mapping, res, respos);
5825 if (x==enc_EXCEPTION) {
5826 return -1;
5827 }
5828 else if (x==enc_FAILED) {
5829 Py_DECREF(repunicode);
5830 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5831 return -1;
5832 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005833 }
5834 *inpos = newpos;
5835 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005836 }
5837 return 0;
5838}
5839
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 Py_ssize_t size,
5842 PyObject *mapping,
5843 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 /* output object */
5846 PyObject *res = NULL;
5847 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005848 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005850 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 PyObject *errorHandler = NULL;
5852 PyObject *exc = NULL;
5853 /* the following variable is used for caching string comparisons
5854 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5855 * 3=ignore, 4=xmlcharrefreplace */
5856 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857
5858 /* Default to Latin-1 */
5859 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 /* allocate enough for a simple encoding without
5863 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005864 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005865 if (res == NULL)
5866 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005867 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 /* try to encode it */
5872 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5873 if (x==enc_EXCEPTION) /* error */
5874 goto onError;
5875 if (x==enc_FAILED) { /* unencodable character */
5876 if (charmap_encoding_error(p, size, &inpos, mapping,
5877 &exc,
5878 &known_errorHandler, &errorHandler, errors,
5879 &res, &respos)) {
5880 goto onError;
5881 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005882 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 else
5884 /* done with this character => adjust input position */
5885 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005888 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005889 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005890 if (_PyBytes_Resize(&res, respos) < 0)
5891 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005892
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 Py_XDECREF(exc);
5894 Py_XDECREF(errorHandler);
5895 return res;
5896
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 Py_XDECREF(res);
5899 Py_XDECREF(exc);
5900 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 return NULL;
5902}
5903
5904PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906{
5907 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 PyErr_BadArgument();
5909 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 }
5911 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 PyUnicode_GET_SIZE(unicode),
5913 mapping,
5914 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915}
5916
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917/* create or adjust a UnicodeTranslateError */
5918static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 const Py_UNICODE *unicode, Py_ssize_t size,
5920 Py_ssize_t startpos, Py_ssize_t endpos,
5921 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005923 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005924 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 }
5927 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5929 goto onError;
5930 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5931 goto onError;
5932 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5933 goto onError;
5934 return;
5935 onError:
5936 Py_DECREF(*exceptionObject);
5937 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938 }
5939}
5940
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941/* raises a UnicodeTranslateError */
5942static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 const Py_UNICODE *unicode, Py_ssize_t size,
5944 Py_ssize_t startpos, Py_ssize_t endpos,
5945 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946{
5947 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005949 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005951}
5952
5953/* error handling callback helper:
5954 build arguments, call the callback and check the arguments,
5955 put the result into newpos and return the replacement string, which
5956 has to be freed by the caller */
5957static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 PyObject **errorHandler,
5959 const char *reason,
5960 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5961 Py_ssize_t startpos, Py_ssize_t endpos,
5962 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005964 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005965
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005966 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967 PyObject *restuple;
5968 PyObject *resunicode;
5969
5970 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005973 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005974 }
5975
5976 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005978 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980
5981 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005983 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005985 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005986 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 Py_DECREF(restuple);
5988 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 }
5990 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 &resunicode, &i_newpos)) {
5992 Py_DECREF(restuple);
5993 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005995 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005997 else
5998 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005999 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6001 Py_DECREF(restuple);
6002 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006003 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 Py_INCREF(resunicode);
6005 Py_DECREF(restuple);
6006 return resunicode;
6007}
6008
6009/* Lookup the character ch in the mapping and put the result in result,
6010 which must be decrefed by the caller.
6011 Return 0 on success, -1 on error */
6012static
6013int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
6014{
Christian Heimes217cfd12007-12-02 14:31:20 +00006015 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 PyObject *x;
6017
6018 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020 x = PyObject_GetItem(mapping, w);
6021 Py_DECREF(w);
6022 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6024 /* No mapping found means: use 1:1 mapping. */
6025 PyErr_Clear();
6026 *result = NULL;
6027 return 0;
6028 } else
6029 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006030 }
6031 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 *result = x;
6033 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006034 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006035 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 long value = PyLong_AS_LONG(x);
6037 long max = PyUnicode_GetMax();
6038 if (value < 0 || value > max) {
6039 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006040 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 Py_DECREF(x);
6042 return -1;
6043 }
6044 *result = x;
6045 return 0;
6046 }
6047 else if (PyUnicode_Check(x)) {
6048 *result = x;
6049 return 0;
6050 }
6051 else {
6052 /* wrong return value */
6053 PyErr_SetString(PyExc_TypeError,
6054 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006055 Py_DECREF(x);
6056 return -1;
6057 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058}
6059/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 if not reallocate and adjust various state variables.
6061 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006062static
Walter Dörwald4894c302003-10-24 14:25:28 +00006063int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006065{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006066 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006067 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 /* remember old output position */
6069 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6070 /* exponentially overallocate to minimize reallocations */
6071 if (requiredsize < 2 * oldsize)
6072 requiredsize = 2 * oldsize;
6073 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6074 return -1;
6075 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076 }
6077 return 0;
6078}
6079/* lookup the character, put the result in the output string and adjust
6080 various state variables. Return a new reference to the object that
6081 was put in the output buffer in *result, or Py_None, if the mapping was
6082 undefined (in which case no character was written).
6083 The called must decref result.
6084 Return 0 on success, -1 on error. */
6085static
Walter Dörwald4894c302003-10-24 14:25:28 +00006086int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6088 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006089{
Walter Dörwald4894c302003-10-24 14:25:28 +00006090 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 /* not found => default to 1:1 mapping */
6094 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006095 }
6096 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006098 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 /* no overflow check, because we know that the space is enough */
6100 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006101 }
6102 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6104 if (repsize==1) {
6105 /* no overflow check, because we know that the space is enough */
6106 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6107 }
6108 else if (repsize!=0) {
6109 /* more than one character */
6110 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6111 (insize - (curinp-startinp)) +
6112 repsize - 1;
6113 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6114 return -1;
6115 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6116 *outp += repsize;
6117 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118 }
6119 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 return 0;
6122}
6123
6124PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 Py_ssize_t size,
6126 PyObject *mapping,
6127 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 /* output object */
6130 PyObject *res = NULL;
6131 /* pointers to the beginning and end+1 of input */
6132 const Py_UNICODE *startp = p;
6133 const Py_UNICODE *endp = p + size;
6134 /* pointer into the output */
6135 Py_UNICODE *str;
6136 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006137 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 char *reason = "character maps to <undefined>";
6139 PyObject *errorHandler = NULL;
6140 PyObject *exc = NULL;
6141 /* the following variable is used for caching string comparisons
6142 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6143 * 3=ignore, 4=xmlcharrefreplace */
6144 int known_errorHandler = -1;
6145
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 PyErr_BadArgument();
6148 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006150
6151 /* allocate enough for a simple 1:1 translation without
6152 replacements, if we need more, we'll resize */
6153 res = PyUnicode_FromUnicode(NULL, size);
6154 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006160 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 /* try to encode it */
6162 PyObject *x = NULL;
6163 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6164 Py_XDECREF(x);
6165 goto onError;
6166 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006167 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 if (x!=Py_None) /* it worked => adjust input pointer */
6169 ++p;
6170 else { /* untranslatable character */
6171 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6172 Py_ssize_t repsize;
6173 Py_ssize_t newpos;
6174 Py_UNICODE *uni2;
6175 /* startpos for collecting untranslatable chars */
6176 const Py_UNICODE *collstart = p;
6177 const Py_UNICODE *collend = p+1;
6178 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 /* find all untranslatable characters */
6181 while (collend < endp) {
6182 if (charmaptranslate_lookup(*collend, mapping, &x))
6183 goto onError;
6184 Py_XDECREF(x);
6185 if (x!=Py_None)
6186 break;
6187 ++collend;
6188 }
6189 /* cache callback name lookup
6190 * (if not done yet, i.e. it's the first error) */
6191 if (known_errorHandler==-1) {
6192 if ((errors==NULL) || (!strcmp(errors, "strict")))
6193 known_errorHandler = 1;
6194 else if (!strcmp(errors, "replace"))
6195 known_errorHandler = 2;
6196 else if (!strcmp(errors, "ignore"))
6197 known_errorHandler = 3;
6198 else if (!strcmp(errors, "xmlcharrefreplace"))
6199 known_errorHandler = 4;
6200 else
6201 known_errorHandler = 0;
6202 }
6203 switch (known_errorHandler) {
6204 case 1: /* strict */
6205 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006206 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 case 2: /* replace */
6208 /* No need to check for space, this is a 1:1 replacement */
6209 for (coll = collstart; coll<collend; ++coll)
6210 *str++ = '?';
6211 /* fall through */
6212 case 3: /* ignore */
6213 p = collend;
6214 break;
6215 case 4: /* xmlcharrefreplace */
6216 /* generate replacement (temporarily (mis)uses p) */
6217 for (p = collstart; p < collend; ++p) {
6218 char buffer[2+29+1+1];
6219 char *cp;
6220 sprintf(buffer, "&#%d;", (int)*p);
6221 if (charmaptranslate_makespace(&res, &str,
6222 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6223 goto onError;
6224 for (cp = buffer; *cp; ++cp)
6225 *str++ = *cp;
6226 }
6227 p = collend;
6228 break;
6229 default:
6230 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6231 reason, startp, size, &exc,
6232 collstart-startp, collend-startp, &newpos);
6233 if (repunicode == NULL)
6234 goto onError;
6235 /* generate replacement */
6236 repsize = PyUnicode_GET_SIZE(repunicode);
6237 if (charmaptranslate_makespace(&res, &str,
6238 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6239 Py_DECREF(repunicode);
6240 goto onError;
6241 }
6242 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6243 *str++ = *uni2;
6244 p = startp + newpos;
6245 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006246 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006247 }
6248 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006249 /* Resize if we allocated to much */
6250 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006251 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 if (PyUnicode_Resize(&res, respos) < 0)
6253 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254 }
6255 Py_XDECREF(exc);
6256 Py_XDECREF(errorHandler);
6257 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006258
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006260 Py_XDECREF(res);
6261 Py_XDECREF(exc);
6262 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 return NULL;
6264}
6265
6266PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 PyObject *mapping,
6268 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269{
6270 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006271
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 str = PyUnicode_FromObject(str);
6273 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 PyUnicode_GET_SIZE(str),
6277 mapping,
6278 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006279 Py_DECREF(str);
6280 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006281
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 Py_XDECREF(str);
6284 return NULL;
6285}
Tim Petersced69f82003-09-16 20:30:58 +00006286
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006287PyObject *
6288PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6289 Py_ssize_t length)
6290{
6291 PyObject *result;
6292 Py_UNICODE *p; /* write pointer into result */
6293 Py_ssize_t i;
6294 /* Copy to a new string */
6295 result = (PyObject *)_PyUnicode_New(length);
6296 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6297 if (result == NULL)
6298 return result;
6299 p = PyUnicode_AS_UNICODE(result);
6300 /* Iterate over code points */
6301 for (i = 0; i < length; i++) {
6302 Py_UNICODE ch =s[i];
6303 if (ch > 127) {
6304 int decimal = Py_UNICODE_TODECIMAL(ch);
6305 if (decimal >= 0)
6306 p[i] = '0' + decimal;
6307 }
6308 }
6309 return result;
6310}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006311/* --- Decimal Encoder ---------------------------------------------------- */
6312
6313int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 Py_ssize_t length,
6315 char *output,
6316 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006317{
6318 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006319 PyObject *errorHandler = NULL;
6320 PyObject *exc = NULL;
6321 const char *encoding = "decimal";
6322 const char *reason = "invalid decimal Unicode string";
6323 /* the following variable is used for caching string comparisons
6324 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6325 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006326
6327 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006328 PyErr_BadArgument();
6329 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006330 }
6331
6332 p = s;
6333 end = s + length;
6334 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006335 register Py_UNICODE ch = *p;
6336 int decimal;
6337 PyObject *repunicode;
6338 Py_ssize_t repsize;
6339 Py_ssize_t newpos;
6340 Py_UNICODE *uni2;
6341 Py_UNICODE *collstart;
6342 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006343
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006345 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 ++p;
6347 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006348 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 decimal = Py_UNICODE_TODECIMAL(ch);
6350 if (decimal >= 0) {
6351 *output++ = '0' + decimal;
6352 ++p;
6353 continue;
6354 }
6355 if (0 < ch && ch < 256) {
6356 *output++ = (char)ch;
6357 ++p;
6358 continue;
6359 }
6360 /* All other characters are considered unencodable */
6361 collstart = p;
Victor Stinnerab1d16b2011-11-22 01:45:37 +01006362 for (collend = p+1; collend < end; collend++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 if ((0 < *collend && *collend < 256) ||
Victor Stinnerab1d16b2011-11-22 01:45:37 +01006364 Py_UNICODE_ISSPACE(*collend) ||
6365 0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 break;
6367 }
6368 /* cache callback name lookup
6369 * (if not done yet, i.e. it's the first error) */
6370 if (known_errorHandler==-1) {
6371 if ((errors==NULL) || (!strcmp(errors, "strict")))
6372 known_errorHandler = 1;
6373 else if (!strcmp(errors, "replace"))
6374 known_errorHandler = 2;
6375 else if (!strcmp(errors, "ignore"))
6376 known_errorHandler = 3;
6377 else if (!strcmp(errors, "xmlcharrefreplace"))
6378 known_errorHandler = 4;
6379 else
6380 known_errorHandler = 0;
6381 }
6382 switch (known_errorHandler) {
6383 case 1: /* strict */
6384 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6385 goto onError;
6386 case 2: /* replace */
6387 for (p = collstart; p < collend; ++p)
6388 *output++ = '?';
6389 /* fall through */
6390 case 3: /* ignore */
6391 p = collend;
6392 break;
6393 case 4: /* xmlcharrefreplace */
6394 /* generate replacement (temporarily (mis)uses p) */
6395 for (p = collstart; p < collend; ++p)
6396 output += sprintf(output, "&#%d;", (int)*p);
6397 p = collend;
6398 break;
6399 default:
6400 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6401 encoding, reason, s, length, &exc,
6402 collstart-s, collend-s, &newpos);
6403 if (repunicode == NULL)
6404 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006405 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006406 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006407 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6408 Py_DECREF(repunicode);
6409 goto onError;
6410 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 /* generate replacement */
6412 repsize = PyUnicode_GET_SIZE(repunicode);
6413 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6414 Py_UNICODE ch = *uni2;
6415 if (Py_UNICODE_ISSPACE(ch))
6416 *output++ = ' ';
6417 else {
6418 decimal = Py_UNICODE_TODECIMAL(ch);
6419 if (decimal >= 0)
6420 *output++ = '0' + decimal;
6421 else if (0 < ch && ch < 256)
6422 *output++ = (char)ch;
6423 else {
6424 Py_DECREF(repunicode);
6425 raise_encode_exception(&exc, encoding,
6426 s, length, collstart-s, collend-s, reason);
6427 goto onError;
6428 }
6429 }
6430 }
6431 p = s + newpos;
6432 Py_DECREF(repunicode);
6433 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006434 }
6435 /* 0-terminate the output string */
6436 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 Py_XDECREF(exc);
6438 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006439 return 0;
6440
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442 Py_XDECREF(exc);
6443 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006444 return -1;
6445}
6446
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447/* --- Helpers ------------------------------------------------------------ */
6448
Eric Smith8c663262007-08-25 02:26:07 +00006449#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006450#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006451
Thomas Wouters477c8d52006-05-27 19:21:47 +00006452#include "stringlib/count.h"
6453#include "stringlib/find.h"
6454#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006455#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006456
Eric Smith5807c412008-05-11 21:00:57 +00006457#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006458#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006459#include "stringlib/localeutil.h"
6460
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken
   relative to `len` and clamped to 0 if it is still negative.
   `start` and `end` must be lvalues and are evaluated several times.

   The body is wrapped in do { } while (0) so the macro behaves like a
   single statement (safe under an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006475
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.  The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 * (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* A pair is only combined when both code units lie inside [ptr, end):
 * the bounds test comes first so that (ptr)[1] is never inspected when
 * 'ptr' addresses the last code unit of the buffer. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((ptr) + 1 < (end)) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
6506
Martin v. Löwis18e16552006-02-15 17:27:45 +00006507Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006508 PyObject *substr,
6509 Py_ssize_t start,
6510 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006512 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006513 PyUnicodeObject* str_obj;
6514 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006515
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6517 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006519 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6520 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 Py_DECREF(str_obj);
6522 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523 }
Tim Petersced69f82003-09-16 20:30:58 +00006524
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006525 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006526 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006527 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6528 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006529 );
6530
6531 Py_DECREF(sub_obj);
6532 Py_DECREF(str_obj);
6533
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 return result;
6535}
6536
Martin v. Löwis18e16552006-02-15 17:27:45 +00006537Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006538 PyObject *sub,
6539 Py_ssize_t start,
6540 Py_ssize_t end,
6541 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006543 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006544
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006546 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006548 sub = PyUnicode_FromObject(sub);
6549 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 Py_DECREF(str);
6551 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 }
Tim Petersced69f82003-09-16 20:30:58 +00006553
Thomas Wouters477c8d52006-05-27 19:21:47 +00006554 if (direction > 0)
6555 result = stringlib_find_slice(
6556 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6557 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6558 start, end
6559 );
6560 else
6561 result = stringlib_rfind_slice(
6562 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6563 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6564 start, end
6565 );
6566
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006568 Py_DECREF(sub);
6569
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 return result;
6571}
6572
Tim Petersced69f82003-09-16 20:30:58 +00006573static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 PyUnicodeObject *substring,
6576 Py_ssize_t start,
6577 Py_ssize_t end,
6578 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 if (substring->length == 0)
6581 return 1;
6582
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006583 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 end -= substring->length;
6585 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587
6588 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 if (Py_UNICODE_MATCH(self, end, substring))
6590 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 } else {
6592 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006594 }
6595
6596 return 0;
6597}
6598
Martin v. Löwis18e16552006-02-15 17:27:45 +00006599Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 PyObject *substr,
6601 Py_ssize_t start,
6602 Py_ssize_t end,
6603 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006605 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006606
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 str = PyUnicode_FromObject(str);
6608 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006609 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 substr = PyUnicode_FromObject(substr);
6611 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006612 Py_DECREF(str);
6613 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 }
Tim Petersced69f82003-09-16 20:30:58 +00006615
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 (PyUnicodeObject *)substr,
6618 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 Py_DECREF(str);
6620 Py_DECREF(substr);
6621 return result;
6622}
6623
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624/* Apply fixfct filter to the Unicode object self and return a
6625 reference to the modified object */
6626
Tim Petersced69f82003-09-16 20:30:58 +00006627static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630{
6631
6632 PyUnicodeObject *u;
6633
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006634 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006637
6638 Py_UNICODE_COPY(u->str, self->str, self->length);
6639
Tim Peters7a29bd52001-09-12 03:03:31 +00006640 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 /* fixfct should return TRUE if it modified the buffer. If
6642 FALSE, return a reference to the original buffer instead
6643 (to save space, not time) */
6644 Py_INCREF(self);
6645 Py_DECREF(u);
6646 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 }
6648 return (PyObject*) u;
6649}
6650
Tim Petersced69f82003-09-16 20:30:58 +00006651static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652int fixupper(PyUnicodeObject *self)
6653{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006654 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 Py_UNICODE *s = self->str;
6656 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006657
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006660
Benjamin Peterson29060642009-01-31 22:14:21 +00006661 ch = Py_UNICODE_TOUPPER(*s);
6662 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006664 *s = ch;
6665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666 s++;
6667 }
6668
6669 return status;
6670}
6671
Tim Petersced69f82003-09-16 20:30:58 +00006672static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673int fixlower(PyUnicodeObject *self)
6674{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006675 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 Py_UNICODE *s = self->str;
6677 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006681
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 ch = Py_UNICODE_TOLOWER(*s);
6683 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 *s = ch;
6686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 s++;
6688 }
6689
6690 return status;
6691}
6692
Tim Petersced69f82003-09-16 20:30:58 +00006693static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694int fixswapcase(PyUnicodeObject *self)
6695{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006696 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 Py_UNICODE *s = self->str;
6698 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006699
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 while (len-- > 0) {
6701 if (Py_UNICODE_ISUPPER(*s)) {
6702 *s = Py_UNICODE_TOLOWER(*s);
6703 status = 1;
6704 } else if (Py_UNICODE_ISLOWER(*s)) {
6705 *s = Py_UNICODE_TOUPPER(*s);
6706 status = 1;
6707 }
6708 s++;
6709 }
6710
6711 return status;
6712}
6713
Tim Petersced69f82003-09-16 20:30:58 +00006714static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715int fixcapitalize(PyUnicodeObject *self)
6716{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006717 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006718 Py_UNICODE *s = self->str;
6719 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006720
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006721 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 return 0;
Ezio Melottiee8d9982011-08-15 09:09:57 +03006723 if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 *s = Py_UNICODE_TOUPPER(*s);
6725 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006727 s++;
6728 while (--len > 0) {
Ezio Melottiee8d9982011-08-15 09:09:57 +03006729 if (!Py_UNICODE_ISLOWER(*s)) {
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006730 *s = Py_UNICODE_TOLOWER(*s);
6731 status = 1;
6732 }
6733 s++;
6734 }
6735 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736}
6737
6738static
6739int fixtitle(PyUnicodeObject *self)
6740{
6741 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6742 register Py_UNICODE *e;
6743 int previous_is_cased;
6744
6745 /* Shortcut for single character strings */
6746 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6748 if (*p != ch) {
6749 *p = ch;
6750 return 1;
6751 }
6752 else
6753 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 }
Tim Petersced69f82003-09-16 20:30:58 +00006755
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 e = p + PyUnicode_GET_SIZE(self);
6757 previous_is_cased = 0;
6758 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006760
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 if (previous_is_cased)
6762 *p = Py_UNICODE_TOLOWER(ch);
6763 else
6764 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006765
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 if (Py_UNICODE_ISLOWER(ch) ||
6767 Py_UNICODE_ISUPPER(ch) ||
6768 Py_UNICODE_ISTITLE(ch))
6769 previous_is_cased = 1;
6770 else
6771 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 }
6773 return 1;
6774}
6775
Tim Peters8ce9f162004-08-27 01:49:32 +00006776PyObject *
6777PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778{
Skip Montanaro6543b452004-09-16 03:28:13 +00006779 const Py_UNICODE blank = ' ';
6780 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006781 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006782 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006783 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6784 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006785 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6786 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006787 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006788 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789
Tim Peters05eba1f2004-08-27 21:32:02 +00006790 fseq = PySequence_Fast(seq, "");
6791 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006792 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006793 }
6794
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006795 /* NOTE: the following code can't call back into Python code,
6796 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006797 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006798
Tim Peters05eba1f2004-08-27 21:32:02 +00006799 seqlen = PySequence_Fast_GET_SIZE(fseq);
6800 /* If empty sequence, return u"". */
6801 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006802 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6803 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006804 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006805 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006806 /* If singleton sequence with an exact Unicode, return that. */
6807 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 item = items[0];
6809 if (PyUnicode_CheckExact(item)) {
6810 Py_INCREF(item);
6811 res = (PyUnicodeObject *)item;
6812 goto Done;
6813 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006814 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006815 else {
6816 /* Set up sep and seplen */
6817 if (separator == NULL) {
6818 sep = &blank;
6819 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006820 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006821 else {
6822 if (!PyUnicode_Check(separator)) {
6823 PyErr_Format(PyExc_TypeError,
6824 "separator: expected str instance,"
6825 " %.80s found",
6826 Py_TYPE(separator)->tp_name);
6827 goto onError;
6828 }
6829 sep = PyUnicode_AS_UNICODE(separator);
6830 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006831 }
6832 }
6833
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006834 /* There are at least two things to join, or else we have a subclass
6835 * of str in the sequence.
6836 * Do a pre-pass to figure out the total amount of space we'll
6837 * need (sz), and see whether all argument are strings.
6838 */
6839 sz = 0;
6840 for (i = 0; i < seqlen; i++) {
6841 const Py_ssize_t old_sz = sz;
6842 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 if (!PyUnicode_Check(item)) {
6844 PyErr_Format(PyExc_TypeError,
6845 "sequence item %zd: expected str instance,"
6846 " %.80s found",
6847 i, Py_TYPE(item)->tp_name);
6848 goto onError;
6849 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006850 sz += PyUnicode_GET_SIZE(item);
6851 if (i != 0)
6852 sz += seplen;
6853 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6854 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006856 goto onError;
6857 }
6858 }
Tim Petersced69f82003-09-16 20:30:58 +00006859
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006860 res = _PyUnicode_New(sz);
6861 if (res == NULL)
6862 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006863
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006864 /* Catenate everything. */
6865 res_p = PyUnicode_AS_UNICODE(res);
6866 for (i = 0; i < seqlen; ++i) {
6867 Py_ssize_t itemlen;
6868 item = items[i];
6869 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 /* Copy item, and maybe the separator. */
6871 if (i) {
6872 Py_UNICODE_COPY(res_p, sep, seplen);
6873 res_p += seplen;
6874 }
6875 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6876 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006877 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006878
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006880 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 return (PyObject *)res;
6882
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006884 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006885 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 return NULL;
6887}
6888
Tim Petersced69f82003-09-16 20:30:58 +00006889static
6890PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 Py_ssize_t left,
6892 Py_ssize_t right,
6893 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894{
6895 PyUnicodeObject *u;
6896
6897 if (left < 0)
6898 left = 0;
6899 if (right < 0)
6900 right = 0;
6901
Tim Peters7a29bd52001-09-12 03:03:31 +00006902 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006903 Py_INCREF(self);
6904 return self;
6905 }
6906
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006907 if (left > PY_SSIZE_T_MAX - self->length ||
6908 right > PY_SSIZE_T_MAX - (left + self->length)) {
6909 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6910 return NULL;
6911 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006912 u = _PyUnicode_New(left + self->length + right);
6913 if (u) {
6914 if (left)
6915 Py_UNICODE_FILL(u->str, fill, left);
6916 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6917 if (right)
6918 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6919 }
6920
6921 return u;
6922}
6923
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006924PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927
6928 string = PyUnicode_FromObject(string);
6929 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006932 list = stringlib_splitlines(
6933 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6934 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935
6936 Py_DECREF(string);
6937 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938}
6939
Tim Petersced69f82003-09-16 20:30:58 +00006940static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 PyUnicodeObject *substring,
6943 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006946 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006949 return stringlib_split_whitespace(
6950 (PyObject*) self, self->str, self->length, maxcount
6951 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006953 return stringlib_split(
6954 (PyObject*) self, self->str, self->length,
6955 substring->str, substring->length,
6956 maxcount
6957 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958}
6959
Tim Petersced69f82003-09-16 20:30:58 +00006960static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006961PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 PyUnicodeObject *substring,
6963 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006964{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006965 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006966 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006967
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006968 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006969 return stringlib_rsplit_whitespace(
6970 (PyObject*) self, self->str, self->length, maxcount
6971 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006972
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006973 return stringlib_rsplit(
6974 (PyObject*) self, self->str, self->length,
6975 substring->str, substring->length,
6976 maxcount
6977 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006978}
6979
6980static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 PyUnicodeObject *str1,
6983 PyUnicodeObject *str2,
6984 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006985{
6986 PyUnicodeObject *u;
6987
6988 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006990 else if (maxcount == 0 || self->length == 0)
6991 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992
Thomas Wouters477c8d52006-05-27 19:21:47 +00006993 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006994 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006995 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006996 if (str1->length == 0)
6997 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006998 if (str1->length == 1) {
6999 /* replace characters */
7000 Py_UNICODE u1, u2;
7001 if (!findchar(self->str, self->length, str1->str[0]))
7002 goto nothing;
7003 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7004 if (!u)
7005 return NULL;
7006 Py_UNICODE_COPY(u->str, self->str, self->length);
7007 u1 = str1->str[0];
7008 u2 = str2->str[0];
7009 for (i = 0; i < u->length; i++)
7010 if (u->str[i] == u1) {
7011 if (--maxcount < 0)
7012 break;
7013 u->str[i] = u2;
7014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007016 i = stringlib_find(
7017 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007019 if (i < 0)
7020 goto nothing;
7021 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7022 if (!u)
7023 return NULL;
7024 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007025
7026 /* change everything in-place, starting with this one */
7027 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7028 i += str1->length;
7029
7030 while ( --maxcount > 0) {
7031 i = stringlib_find(self->str+i, self->length-i,
7032 str1->str, str1->length,
7033 i);
7034 if (i == -1)
7035 break;
7036 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7037 i += str1->length;
7038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007041
Victor Stinnerab1d16b2011-11-22 01:45:37 +01007042 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007043 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 Py_UNICODE *p;
7045
7046 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007047 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7048 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007049 if (n == 0)
7050 goto nothing;
7051 /* new_size = self->length + n * (str2->length - str1->length)); */
7052 delta = (str2->length - str1->length);
7053 if (delta == 0) {
7054 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007056 product = n * (str2->length - str1->length);
7057 if ((product / (str2->length - str1->length)) != n) {
7058 PyErr_SetString(PyExc_OverflowError,
7059 "replace string is too long");
7060 return NULL;
7061 }
7062 new_size = self->length + product;
7063 if (new_size < 0) {
7064 PyErr_SetString(PyExc_OverflowError,
7065 "replace string is too long");
7066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 }
7068 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007069 u = _PyUnicode_New(new_size);
7070 if (!u)
7071 return NULL;
7072 i = 0;
7073 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007074 if (str1->length > 0) {
7075 while (n-- > 0) {
7076 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007077 j = stringlib_find(self->str+i, self->length-i,
7078 str1->str, str1->length,
7079 i);
7080 if (j == -1)
7081 break;
7082 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007083 /* copy unchanged part [i:j] */
7084 Py_UNICODE_COPY(p, self->str+i, j-i);
7085 p += j - i;
7086 }
7087 /* copy substitution string */
7088 if (str2->length > 0) {
7089 Py_UNICODE_COPY(p, str2->str, str2->length);
7090 p += str2->length;
7091 }
7092 i = j + str1->length;
7093 }
7094 if (i < self->length)
7095 /* copy tail [i:] */
7096 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7097 } else {
7098 /* interleave */
7099 while (n > 0) {
7100 Py_UNICODE_COPY(p, str2->str, str2->length);
7101 p += str2->length;
7102 if (--n <= 0)
7103 break;
7104 *p++ = self->str[i++];
7105 }
7106 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007109 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007110
Benjamin Peterson29060642009-01-31 22:14:21 +00007111 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007112 /* nothing to replace; return original string (when possible) */
7113 if (PyUnicode_CheckExact(self)) {
7114 Py_INCREF(self);
7115 return (PyObject *) self;
7116 }
7117 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118}
7119
7120/* --- Unicode Object Methods --------------------------------------------- */
7121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007122PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124\n\
7125Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007126characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127
7128static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007129unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007131 return fixup(self, fixtitle);
7132}
7133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007134PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136\n\
7137Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007138have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139
7140static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007141unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007143 return fixup(self, fixcapitalize);
7144}
7145
#if 0
/* Disabled: capwords() is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL)
            goto done;
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return result;
}
#endif
7183
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007184/* Argument converter. Coerces to a single unicode character */
7185
7186static int
7187convert_uc(PyObject *obj, void *addr)
7188{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007189 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7190 PyObject *uniobj;
7191 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007192
Benjamin Peterson14339b62009-01-31 16:36:08 +00007193 uniobj = PyUnicode_FromObject(obj);
7194 if (uniobj == NULL) {
7195 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007197 return 0;
7198 }
7199 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7200 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007202 Py_DECREF(uniobj);
7203 return 0;
7204 }
7205 unistr = PyUnicode_AS_UNICODE(uniobj);
7206 *fillcharloc = unistr[0];
7207 Py_DECREF(uniobj);
7208 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007209}
7210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007211PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007214Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007215done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216
7217static PyObject *
7218unicode_center(PyUnicodeObject *self, PyObject *args)
7219{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007220 Py_ssize_t marg, left;
7221 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007222 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223
Thomas Woutersde017742006-02-16 19:34:37 +00007224 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225 return NULL;
7226
Tim Peters7a29bd52001-09-12 03:03:31 +00007227 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007228 Py_INCREF(self);
7229 return (PyObject*) self;
7230 }
7231
7232 marg = width - self->length;
7233 left = marg / 2 + (marg & width & 1);
7234
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007235 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007236}
7237
Marc-André Lemburge5034372000-08-08 08:04:29 +00007238#if 0
7239
7240/* This code should go into some future Unicode collation support
7241 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007242 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007243
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007244/* speedy UTF-16 code point order comparison */
7245/* gleaned from: */
7246/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7247
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007248static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007249{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007250 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007251 0, 0, 0, 0, 0, 0, 0, 0,
7252 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007253 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007254};
7255
Guido van Rossumd57fd912000-03-10 22:53:23 +00007256static int
7257unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7258{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007259 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007260
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 Py_UNICODE *s1 = str1->str;
7262 Py_UNICODE *s2 = str2->str;
7263
7264 len1 = str1->length;
7265 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007266
Guido van Rossumd57fd912000-03-10 22:53:23 +00007267 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007268 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007269
7270 c1 = *s1++;
7271 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007272
Benjamin Peterson29060642009-01-31 22:14:21 +00007273 if (c1 > (1<<11) * 26)
7274 c1 += utf16Fixup[c1>>11];
7275 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007276 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007277 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007278
7279 if (c1 != c2)
7280 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007281
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007282 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007283 }
7284
7285 return (len1 < len2) ? -1 : (len1 != len2);
7286}
7287
Marc-André Lemburge5034372000-08-08 08:04:29 +00007288#else
7289
7290static int
7291unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7292{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007293 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007294
7295 Py_UNICODE *s1 = str1->str;
7296 Py_UNICODE *s2 = str2->str;
7297
7298 len1 = str1->length;
7299 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007300
Marc-André Lemburge5034372000-08-08 08:04:29 +00007301 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007302 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007303
Fredrik Lundh45714e92001-06-26 16:39:36 +00007304 c1 = *s1++;
7305 c2 = *s2++;
7306
7307 if (c1 != c2)
7308 return (c1 < c2) ? -1 : 1;
7309
Marc-André Lemburge5034372000-08-08 08:04:29 +00007310 len1--; len2--;
7311 }
7312
7313 return (len1 < len2) ? -1 : (len1 != len2);
7314}
7315
7316#endif
7317
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007320{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007321 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7322 return unicode_compare((PyUnicodeObject *)left,
7323 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007324 PyErr_Format(PyExc_TypeError,
7325 "Can't compare %.100s and %.100s",
7326 left->ob_type->tp_name,
7327 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007328 return -1;
7329}
7330
Martin v. Löwis5b222132007-06-10 09:51:05 +00007331int
7332PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7333{
7334 int i;
7335 Py_UNICODE *id;
7336 assert(PyUnicode_Check(uni));
7337 id = PyUnicode_AS_UNICODE(uni);
7338 /* Compare Unicode string and source character set string */
7339 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 if (id[i] != str[i])
7341 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007342 /* This check keeps Python strings that end in '\0' from comparing equal
7343 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007344 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007346 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007347 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007348 return 0;
7349}
7350
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007351
Benjamin Peterson29060642009-01-31 22:14:21 +00007352#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007353 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007354
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007355PyObject *PyUnicode_RichCompare(PyObject *left,
7356 PyObject *right,
7357 int op)
7358{
7359 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007360
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007361 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7362 PyObject *v;
7363 if (((PyUnicodeObject *) left)->length !=
7364 ((PyUnicodeObject *) right)->length) {
7365 if (op == Py_EQ) {
7366 Py_INCREF(Py_False);
7367 return Py_False;
7368 }
7369 if (op == Py_NE) {
7370 Py_INCREF(Py_True);
7371 return Py_True;
7372 }
7373 }
7374 if (left == right)
7375 result = 0;
7376 else
7377 result = unicode_compare((PyUnicodeObject *)left,
7378 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007379
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007380 /* Convert the return value to a Boolean */
7381 switch (op) {
7382 case Py_EQ:
7383 v = TEST_COND(result == 0);
7384 break;
7385 case Py_NE:
7386 v = TEST_COND(result != 0);
7387 break;
7388 case Py_LE:
7389 v = TEST_COND(result <= 0);
7390 break;
7391 case Py_GE:
7392 v = TEST_COND(result >= 0);
7393 break;
7394 case Py_LT:
7395 v = TEST_COND(result == -1);
7396 break;
7397 case Py_GT:
7398 v = TEST_COND(result == 1);
7399 break;
7400 default:
7401 PyErr_BadArgument();
7402 return NULL;
7403 }
7404 Py_INCREF(v);
7405 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007406 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007407
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007408 Py_INCREF(Py_NotImplemented);
7409 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007410}
7411
Guido van Rossum403d68b2000-03-13 15:55:09 +00007412int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007413 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007414{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007415 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007416 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007417
7418 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007419 sub = PyUnicode_FromObject(element);
7420 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 PyErr_Format(PyExc_TypeError,
7422 "'in <string>' requires string as left operand, not %s",
7423 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007424 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007425 }
7426
Thomas Wouters477c8d52006-05-27 19:21:47 +00007427 str = PyUnicode_FromObject(container);
7428 if (!str) {
7429 Py_DECREF(sub);
7430 return -1;
7431 }
7432
7433 result = stringlib_contains_obj(str, sub);
7434
7435 Py_DECREF(str);
7436 Py_DECREF(sub);
7437
Guido van Rossum403d68b2000-03-13 15:55:09 +00007438 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007439}
7440
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441/* Concat to string or Unicode object giving a new Unicode object. */
7442
7443PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445{
7446 PyUnicodeObject *u = NULL, *v = NULL, *w;
7447
7448 /* Coerce the two arguments */
7449 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7450 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7453 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455
7456 /* Shortcuts */
7457 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 Py_DECREF(v);
7459 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460 }
7461 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 Py_DECREF(u);
7463 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 }
7465
7466 /* Concat the two Unicode strings */
7467 w = _PyUnicode_New(u->length + v->length);
7468 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470 Py_UNICODE_COPY(w->str, u->str, u->length);
7471 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7472
7473 Py_DECREF(u);
7474 Py_DECREF(v);
7475 return (PyObject *)w;
7476
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478 Py_XDECREF(u);
7479 Py_XDECREF(v);
7480 return NULL;
7481}
7482
Walter Dörwald1ab83302007-05-18 17:15:44 +00007483void
7484PyUnicode_Append(PyObject **pleft, PyObject *right)
7485{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007486 PyObject *new;
7487 if (*pleft == NULL)
7488 return;
7489 if (right == NULL || !PyUnicode_Check(*pleft)) {
7490 Py_DECREF(*pleft);
7491 *pleft = NULL;
7492 return;
7493 }
7494 new = PyUnicode_Concat(*pleft, right);
7495 Py_DECREF(*pleft);
7496 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007497}
7498
7499void
7500PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7501{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007502 PyUnicode_Append(pleft, right);
7503 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007504}
7505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007506PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007509Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007510string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007511interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512
7513static PyObject *
7514unicode_count(PyUnicodeObject *self, PyObject *args)
7515{
7516 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007517 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007518 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519 PyObject *result;
7520
Jesus Ceaac451502011-04-20 17:09:23 +02007521 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7522 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007524
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007525 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007526 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007527 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007528 substring->str, substring->length,
7529 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007530 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531
7532 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007533
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534 return result;
7535}
7536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007537PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007538 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007539\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007540Encode S using the codec registered for encoding. Default encoding\n\
7541is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007542handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007543a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7544'xmlcharrefreplace' as well as any other name registered with\n\
7545codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546
7547static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007548unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007550 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551 char *encoding = NULL;
7552 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007553
Benjamin Peterson308d6372009-09-18 21:42:35 +00007554 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7555 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007557 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007558}
7559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007560PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562\n\
7563Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007564If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565
7566static PyObject*
7567unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7568{
7569 Py_UNICODE *e;
7570 Py_UNICODE *p;
7571 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007572 Py_UNICODE *qe;
7573 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574 PyUnicodeObject *u;
7575 int tabsize = 8;
7576
7577 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007578 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579
Thomas Wouters7e474022000-07-16 12:04:32 +00007580 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007581 i = 0; /* chars up to and including most recent \n or \r */
7582 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7583 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584 for (p = self->str; p < e; p++)
7585 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 if (tabsize > 0) {
7587 incr = tabsize - (j % tabsize); /* cannot overflow */
7588 if (j > PY_SSIZE_T_MAX - incr)
7589 goto overflow1;
7590 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007591 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 if (j > PY_SSIZE_T_MAX - 1)
7595 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 j++;
7597 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 if (i > PY_SSIZE_T_MAX - j)
7599 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007601 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 }
7603 }
7604
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007605 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007607
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 /* Second pass: create output string and fill it */
7609 u = _PyUnicode_New(i + j);
7610 if (!u)
7611 return NULL;
7612
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007613 j = 0; /* same as in first pass */
7614 q = u->str; /* next output char */
7615 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616
7617 for (p = self->str; p < e; p++)
7618 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 if (tabsize > 0) {
7620 i = tabsize - (j % tabsize);
7621 j += i;
7622 while (i--) {
7623 if (q >= qe)
7624 goto overflow2;
7625 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007626 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 else {
7630 if (q >= qe)
7631 goto overflow2;
7632 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007633 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007634 if (*p == '\n' || *p == '\r')
7635 j = 0;
7636 }
7637
7638 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007639
7640 overflow2:
7641 Py_DECREF(u);
7642 overflow1:
7643 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7644 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645}
7646
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007647PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649\n\
7650Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08007651such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652arguments start and end are interpreted as in slice notation.\n\
7653\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007654Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655
7656static PyObject *
7657unicode_find(PyUnicodeObject *self, PyObject *args)
7658{
Jesus Ceaac451502011-04-20 17:09:23 +02007659 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007660 Py_ssize_t start;
7661 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007662 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663
Jesus Ceaac451502011-04-20 17:09:23 +02007664 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7665 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667
Thomas Wouters477c8d52006-05-27 19:21:47 +00007668 result = stringlib_find_slice(
7669 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7670 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7671 start, end
7672 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673
7674 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007675
Christian Heimes217cfd12007-12-02 14:31:20 +00007676 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677}
7678
7679static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007680unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681{
7682 if (index < 0 || index >= self->length) {
7683 PyErr_SetString(PyExc_IndexError, "string index out of range");
7684 return NULL;
7685 }
7686
7687 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7688}
7689
Guido van Rossumc2504932007-09-18 19:42:40 +00007690/* Believe it or not, this produces the same value for ASCII strings
7691 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007692static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007693unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694{
Guido van Rossumc2504932007-09-18 19:42:40 +00007695 Py_ssize_t len;
7696 Py_UNICODE *p;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -08007697 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +00007698
Benjamin Petersonf6622c82012-04-09 14:53:07 -04007699#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -05007700 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -04007701#endif
Guido van Rossumc2504932007-09-18 19:42:40 +00007702 if (self->hash != -1)
7703 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007704 len = Py_SIZE(self);
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007705 /*
7706 We make the hash of the empty string be 0, rather than using
7707 (prefix ^ suffix), since this slightly obfuscates the hash secret
7708 */
7709 if (len == 0) {
7710 self->hash = 0;
7711 return 0;
7712 }
Guido van Rossumc2504932007-09-18 19:42:40 +00007713 p = self->str;
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007714 x = _Py_HashSecret.prefix;
7715 x ^= *p << 7;
Guido van Rossumc2504932007-09-18 19:42:40 +00007716 while (--len >= 0)
Gregory P. Smith63e6c322012-01-14 15:31:34 -08007717 x = (_PyHASH_MULTIPLIER*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007718 x ^= Py_SIZE(self);
Georg Brandl2daf6ae2012-02-20 19:54:16 +01007719 x ^= _Py_HashSecret.suffix;
Guido van Rossumc2504932007-09-18 19:42:40 +00007720 if (x == -1)
7721 x = -2;
7722 self->hash = x;
7723 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724}
7725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007726PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007729Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
7731static PyObject *
7732unicode_index(PyUnicodeObject *self, PyObject *args)
7733{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007734 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007735 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007736 Py_ssize_t start;
7737 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738
Jesus Ceaac451502011-04-20 17:09:23 +02007739 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7740 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742
Thomas Wouters477c8d52006-05-27 19:21:47 +00007743 result = stringlib_find_slice(
7744 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7745 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7746 start, end
7747 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748
7749 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007750
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 if (result < 0) {
7752 PyErr_SetString(PyExc_ValueError, "substring not found");
7753 return NULL;
7754 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007755
Christian Heimes217cfd12007-12-02 14:31:20 +00007756 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757}
7758
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007759PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007761\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007762Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007763at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764
7765static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007766unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767{
7768 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7769 register const Py_UNICODE *e;
7770 int cased;
7771
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 /* Shortcut for single character strings */
7773 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007776 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007777 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007779
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780 e = p + PyUnicode_GET_SIZE(self);
7781 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007782 while (p < e) {
7783 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007784
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7786 return PyBool_FromLong(0);
7787 else if (!cased && Py_UNICODE_ISLOWER(ch))
7788 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007790 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791}
7792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007793PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007796Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007797at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798
7799static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007800unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801{
7802 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7803 register const Py_UNICODE *e;
7804 int cased;
7805
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806 /* Shortcut for single character strings */
7807 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007810 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007811 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007813
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814 e = p + PyUnicode_GET_SIZE(self);
7815 cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007816 while (p < e) {
7817 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007818
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7820 return PyBool_FromLong(0);
7821 else if (!cased && Py_UNICODE_ISUPPER(ch))
7822 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007824 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825}
7826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007827PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007830Return True if S is a titlecased string and there is at least one\n\
7831character in S, i.e. upper- and titlecase characters may only\n\
7832follow uncased characters and lowercase characters only cased ones.\n\
7833Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834
7835static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007836unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007837{
7838 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7839 register const Py_UNICODE *e;
7840 int cased, previous_is_cased;
7841
Guido van Rossumd57fd912000-03-10 22:53:23 +00007842 /* Shortcut for single character strings */
7843 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7845 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007846
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007847 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007848 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007850
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 e = p + PyUnicode_GET_SIZE(self);
7852 cased = 0;
7853 previous_is_cased = 0;
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007854 while (p < e) {
7855 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
Tim Petersced69f82003-09-16 20:30:58 +00007856
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7858 if (previous_is_cased)
7859 return PyBool_FromLong(0);
7860 previous_is_cased = 1;
7861 cased = 1;
7862 }
7863 else if (Py_UNICODE_ISLOWER(ch)) {
7864 if (!previous_is_cased)
7865 return PyBool_FromLong(0);
7866 previous_is_cased = 1;
7867 cased = 1;
7868 }
7869 else
7870 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007872 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873}
7874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007875PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007878Return True if all characters in S are whitespace\n\
7879and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880
7881static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007882unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883{
7884 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7885 register const Py_UNICODE *e;
7886
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 /* Shortcut for single character strings */
7888 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 Py_UNICODE_ISSPACE(*p))
7890 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007892 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007893 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007895
Guido van Rossumd57fd912000-03-10 22:53:23 +00007896 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007897 while (p < e) {
7898 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7899 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007902 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903}
7904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007905PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007907\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007908Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007909and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007910
7911static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007912unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007913{
7914 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7915 register const Py_UNICODE *e;
7916
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007917 /* Shortcut for single character strings */
7918 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 Py_UNICODE_ISALPHA(*p))
7920 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007921
7922 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007923 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007924 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007925
7926 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007927 while (p < e) {
7928 if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007930 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007931 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007932}
7933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007934PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007936\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007937Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007938and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007939
7940static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007941unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007942{
7943 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7944 register const Py_UNICODE *e;
7945
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007946 /* Shortcut for single character strings */
7947 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 Py_UNICODE_ISALNUM(*p))
7949 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007950
7951 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007952 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007954
7955 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007956 while (p < e) {
7957 const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7958 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007960 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007961 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007962}
7963
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007964PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007967Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007968False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969
7970static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007971unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972{
7973 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7974 register const Py_UNICODE *e;
7975
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 /* Shortcut for single character strings */
7977 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 Py_UNICODE_ISDECIMAL(*p))
7979 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007981 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007982 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007984
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03007986 while (p < e) {
7987 if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007989 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007990 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991}
7992
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007993PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007996Return True if all characters in S are digits\n\
7997and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998
7999static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008000unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001{
8002 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8003 register const Py_UNICODE *e;
8004
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 /* Shortcut for single character strings */
8006 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 Py_UNICODE_ISDIGIT(*p))
8008 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008010 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008011 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008013
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008015 while (p < e) {
8016 if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008019 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020}
8021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008022PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008025Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008026False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027
8028static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008029unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030{
8031 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8032 register const Py_UNICODE *e;
8033
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034 /* Shortcut for single character strings */
8035 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 Py_UNICODE_ISNUMERIC(*p))
8037 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008039 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008040 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008042
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008044 while (p < e) {
8045 if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008048 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008049}
8050
Martin v. Löwis47383402007-08-15 07:32:56 +00008051int
8052PyUnicode_IsIdentifier(PyObject *self)
8053{
Benjamin Petersonf413b802011-08-12 22:17:18 -05008054 const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008055 const Py_UNICODE *e;
8056 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +00008057
8058 /* Special case for empty strings */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008059 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008061
8062 /* PEP 3131 says that the first character must be in
8063 XID_Start and subsequent characters in XID_Continue,
8064 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008065 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008066 letters, digits, underscore). However, given the current
8067 definition of XID_Start and XID_Continue, it is sufficient
8068 to check just for these, except that _ must be allowed
8069 as starting an identifier. */
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008070 e = p + PyUnicode_GET_SIZE(self);
8071 first = _Py_UNICODE_NEXT(p, e);
Benjamin Petersonf413b802011-08-12 22:17:18 -05008072 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +00008073 return 0;
8074
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008075 while (p < e)
8076 if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008078 return 1;
8079}
8080
8081PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008083\n\
8084Return True if S is a valid identifier according\n\
8085to the language definition.");
8086
8087static PyObject*
8088unicode_isidentifier(PyObject *self)
8089{
8090 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8091}
8092
Georg Brandl559e5d72008-06-11 18:37:52 +00008093PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008095\n\
8096Return True if all characters in S are considered\n\
8097printable in repr() or S is empty, False otherwise.");
8098
8099static PyObject*
8100unicode_isprintable(PyObject *self)
8101{
8102 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8103 register const Py_UNICODE *e;
8104
8105 /* Shortcut for single character strings */
8106 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8107 Py_RETURN_TRUE;
8108 }
8109
8110 e = p + PyUnicode_GET_SIZE(self);
Ezio Melotti93e7afc2011-08-22 14:08:38 +03008111 while (p < e) {
8112 if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
Georg Brandl559e5d72008-06-11 18:37:52 +00008113 Py_RETURN_FALSE;
8114 }
8115 }
8116 Py_RETURN_TRUE;
8117}
8118
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008119PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008120 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008121\n\
8122Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008123iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124
8125static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008126unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008128 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129}
8130
Martin v. Löwis18e16552006-02-15 17:27:45 +00008131static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008132unicode_length(PyUnicodeObject *self)
8133{
8134 return self->length;
8135}
8136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008137PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008140Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008141done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142
8143static PyObject *
8144unicode_ljust(PyUnicodeObject *self, PyObject *args)
8145{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008146 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008147 Py_UNICODE fillchar = ' ';
8148
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008149 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 return NULL;
8151
Tim Peters7a29bd52001-09-12 03:03:31 +00008152 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 Py_INCREF(self);
8154 return (PyObject*) self;
8155 }
8156
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008157 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158}
8159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008160PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008163Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164
8165static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008166unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008168 return fixup(self, fixlower);
8169}
8170
/* Selectors for the strip helpers; they double as indexes into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8179
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008180/* externally visible for str.strip(unicode) */
8181PyObject *
8182_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8183{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008184 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8185 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8186 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8187 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8188 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008189
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008191
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 i = 0;
8193 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8195 i++;
8196 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008197 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008198
Benjamin Peterson14339b62009-01-31 16:36:08 +00008199 j = len;
8200 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 do {
8202 j--;
8203 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8204 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008205 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008206
Benjamin Peterson14339b62009-01-31 16:36:08 +00008207 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 Py_INCREF(self);
8209 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 }
8211 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008212 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008213}
8214
Guido van Rossumd57fd912000-03-10 22:53:23 +00008215
8216static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008217do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008219 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8220 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008221
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 i = 0;
8223 if (striptype != RIGHTSTRIP) {
8224 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8225 i++;
8226 }
8227 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008228
Benjamin Peterson14339b62009-01-31 16:36:08 +00008229 j = len;
8230 if (striptype != LEFTSTRIP) {
8231 do {
8232 j--;
8233 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8234 j++;
8235 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008236
Benjamin Peterson14339b62009-01-31 16:36:08 +00008237 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8238 Py_INCREF(self);
8239 return (PyObject*)self;
8240 }
8241 else
8242 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243}
8244
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008245
8246static PyObject *
8247do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8248{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008249 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008250
Benjamin Peterson14339b62009-01-31 16:36:08 +00008251 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8252 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008253
Benjamin Peterson14339b62009-01-31 16:36:08 +00008254 if (sep != NULL && sep != Py_None) {
8255 if (PyUnicode_Check(sep))
8256 return _PyUnicode_XStrip(self, striptype, sep);
8257 else {
8258 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 "%s arg must be None or str",
8260 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008261 return NULL;
8262 }
8263 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008264
Benjamin Peterson14339b62009-01-31 16:36:08 +00008265 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008266}
8267
8268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008269PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008271\n\
8272Return a copy of the string S with leading and trailing\n\
8273whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008274If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008275
8276static PyObject *
8277unicode_strip(PyUnicodeObject *self, PyObject *args)
8278{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008279 if (PyTuple_GET_SIZE(args) == 0)
8280 return do_strip(self, BOTHSTRIP); /* Common case */
8281 else
8282 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008283}
8284
8285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008286PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008288\n\
8289Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008290If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008291
8292static PyObject *
8293unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8294{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008295 if (PyTuple_GET_SIZE(args) == 0)
8296 return do_strip(self, LEFTSTRIP); /* Common case */
8297 else
8298 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008299}
8300
8301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008302PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008304\n\
8305Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008306If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008307
8308static PyObject *
8309unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8310{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008311 if (PyTuple_GET_SIZE(args) == 0)
8312 return do_strip(self, RIGHTSTRIP); /* Common case */
8313 else
8314 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008315}
8316
8317
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008319unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320{
8321 PyUnicodeObject *u;
8322 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008323 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008324 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325
Georg Brandl222de0f2009-04-12 12:01:50 +00008326 if (len < 1) {
8327 Py_INCREF(unicode_empty);
8328 return (PyObject *)unicode_empty;
8329 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330
Tim Peters7a29bd52001-09-12 03:03:31 +00008331 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332 /* no repeat, return original string */
8333 Py_INCREF(str);
8334 return (PyObject*) str;
8335 }
Tim Peters8f422462000-09-09 06:13:41 +00008336
8337 /* ensure # of chars needed doesn't overflow int and # of bytes
8338 * needed doesn't overflow size_t
8339 */
8340 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008341 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008342 PyErr_SetString(PyExc_OverflowError,
8343 "repeated string is too long");
8344 return NULL;
8345 }
8346 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8347 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8348 PyErr_SetString(PyExc_OverflowError,
8349 "repeated string is too long");
8350 return NULL;
8351 }
8352 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 if (!u)
8354 return NULL;
8355
8356 p = u->str;
8357
Georg Brandl222de0f2009-04-12 12:01:50 +00008358 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008359 Py_UNICODE_FILL(p, str->str[0], len);
8360 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008361 Py_ssize_t done = str->length; /* number of characters copied this far */
8362 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008364 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008365 Py_UNICODE_COPY(p+done, p, n);
8366 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368 }
8369
8370 return (PyObject*) u;
8371}
8372
8373PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 PyObject *subobj,
8375 PyObject *replobj,
8376 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377{
8378 PyObject *self;
8379 PyObject *str1;
8380 PyObject *str2;
8381 PyObject *result;
8382
8383 self = PyUnicode_FromObject(obj);
8384 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 str1 = PyUnicode_FromObject(subobj);
8387 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 Py_DECREF(self);
8389 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390 }
8391 str2 = PyUnicode_FromObject(replobj);
8392 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 Py_DECREF(self);
8394 Py_DECREF(str1);
8395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008396 }
Tim Petersced69f82003-09-16 20:30:58 +00008397 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 (PyUnicodeObject *)str1,
8399 (PyUnicodeObject *)str2,
8400 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401 Py_DECREF(self);
8402 Py_DECREF(str1);
8403 Py_DECREF(str2);
8404 return result;
8405}
8406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008407PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008408 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409\n\
8410Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008411old replaced by new. If the optional argument count is\n\
8412given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413
8414static PyObject*
8415unicode_replace(PyUnicodeObject *self, PyObject *args)
8416{
8417 PyUnicodeObject *str1;
8418 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008419 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 PyObject *result;
8421
Martin v. Löwis18e16552006-02-15 17:27:45 +00008422 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 return NULL;
8424 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8425 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008428 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 Py_DECREF(str1);
8430 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432
8433 result = replace(self, str1, str2, maxcount);
8434
8435 Py_DECREF(str1);
8436 Py_DECREF(str2);
8437 return result;
8438}
8439
8440static
8441PyObject *unicode_repr(PyObject *unicode)
8442{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008443 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008444 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008445 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8446 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8447
8448 /* XXX(nnorwitz): rather than over-allocating, it would be
8449 better to choose a different scheme. Perhaps scan the
8450 first N-chars of the string and allocate based on that size.
8451 */
8452 /* Initial allocation is based on the longest-possible unichr
8453 escape.
8454
8455 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8456 unichr, so in this case it's the longest unichr escape. In
8457 narrow (UTF-16) builds this is five chars per source unichr
8458 since there are two unichrs in the surrogate pair, so in narrow
8459 (UTF-16) builds it's not the longest unichr escape.
8460
8461 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8462 so in the narrow (UTF-16) build case it's the longest unichr
8463 escape.
8464 */
8465
Walter Dörwald1ab83302007-05-18 17:15:44 +00008466 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008468#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008470#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008472#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008474 if (repr == NULL)
8475 return NULL;
8476
Walter Dörwald1ab83302007-05-18 17:15:44 +00008477 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008478
8479 /* Add quote */
8480 *p++ = (findchar(s, size, '\'') &&
8481 !findchar(s, size, '"')) ? '"' : '\'';
8482 while (size-- > 0) {
8483 Py_UNICODE ch = *s++;
8484
8485 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008486 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008487 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008488 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008489 continue;
8490 }
8491
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008493 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008494 *p++ = '\\';
8495 *p++ = 't';
8496 }
8497 else if (ch == '\n') {
8498 *p++ = '\\';
8499 *p++ = 'n';
8500 }
8501 else if (ch == '\r') {
8502 *p++ = '\\';
8503 *p++ = 'r';
8504 }
8505
8506 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008507 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008508 *p++ = '\\';
8509 *p++ = 'x';
8510 *p++ = hexdigits[(ch >> 4) & 0x000F];
8511 *p++ = hexdigits[ch & 0x000F];
8512 }
8513
Georg Brandl559e5d72008-06-11 18:37:52 +00008514 /* Copy ASCII characters as-is */
8515 else if (ch < 0x7F) {
8516 *p++ = ch;
8517 }
8518
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008520 else {
8521 Py_UCS4 ucs = ch;
8522
8523#ifndef Py_UNICODE_WIDE
8524 Py_UNICODE ch2 = 0;
8525 /* Get code point from surrogate pair */
8526 if (size > 0) {
8527 ch2 = *s;
8528 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008530 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008532 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008533 size--;
8534 }
8535 }
8536#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008537 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008538 (categories Z* and C* except ASCII space)
8539 */
8540 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8541 /* Map 8-bit characters to '\xhh' */
8542 if (ucs <= 0xff) {
8543 *p++ = '\\';
8544 *p++ = 'x';
8545 *p++ = hexdigits[(ch >> 4) & 0x000F];
8546 *p++ = hexdigits[ch & 0x000F];
8547 }
8548 /* Map 21-bit characters to '\U00xxxxxx' */
8549 else if (ucs >= 0x10000) {
8550 *p++ = '\\';
8551 *p++ = 'U';
8552 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8553 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8554 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8555 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8556 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8557 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8558 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8559 *p++ = hexdigits[ucs & 0x0000000F];
8560 }
8561 /* Map 16-bit characters to '\uxxxx' */
8562 else {
8563 *p++ = '\\';
8564 *p++ = 'u';
8565 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8566 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8567 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8568 *p++ = hexdigits[ucs & 0x000F];
8569 }
8570 }
8571 /* Copy characters as-is */
8572 else {
8573 *p++ = ch;
8574#ifndef Py_UNICODE_WIDE
8575 if (ucs >= 0x10000)
8576 *p++ = ch2;
8577#endif
8578 }
8579 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008580 }
8581 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008582 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008583
8584 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008585 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008586 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587}
8588
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008589PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591\n\
8592Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08008593such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594arguments start and end are interpreted as in slice notation.\n\
8595\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008596Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597
8598static PyObject *
8599unicode_rfind(PyUnicodeObject *self, PyObject *args)
8600{
Jesus Ceaac451502011-04-20 17:09:23 +02008601 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008602 Py_ssize_t start;
8603 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008604 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605
Jesus Ceaac451502011-04-20 17:09:23 +02008606 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8607 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
Thomas Wouters477c8d52006-05-27 19:21:47 +00008610 result = stringlib_rfind_slice(
8611 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8612 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8613 start, end
8614 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615
8616 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008617
Christian Heimes217cfd12007-12-02 14:31:20 +00008618 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619}
8620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008621PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008624Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625
8626static PyObject *
8627unicode_rindex(PyUnicodeObject *self, PyObject *args)
8628{
Jesus Ceaac451502011-04-20 17:09:23 +02008629 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008630 Py_ssize_t start;
8631 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008632 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633
Jesus Ceaac451502011-04-20 17:09:23 +02008634 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8635 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008636 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637
Thomas Wouters477c8d52006-05-27 19:21:47 +00008638 result = stringlib_rfind_slice(
8639 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8640 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8641 start, end
8642 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643
8644 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008645
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 if (result < 0) {
8647 PyErr_SetString(PyExc_ValueError, "substring not found");
8648 return NULL;
8649 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008650 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651}
8652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008653PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008656Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008657done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658
8659static PyObject *
8660unicode_rjust(PyUnicodeObject *self, PyObject *args)
8661{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008662 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008663 Py_UNICODE fillchar = ' ';
8664
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008665 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 return NULL;
8667
Tim Peters7a29bd52001-09-12 03:03:31 +00008668 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 Py_INCREF(self);
8670 return (PyObject*) self;
8671 }
8672
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008673 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674}
8675
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 PyObject *sep,
8678 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679{
8680 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008681
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 s = PyUnicode_FromObject(s);
8683 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008684 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 if (sep != NULL) {
8686 sep = PyUnicode_FromObject(sep);
8687 if (sep == NULL) {
8688 Py_DECREF(s);
8689 return NULL;
8690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691 }
8692
8693 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8694
8695 Py_DECREF(s);
8696 Py_XDECREF(sep);
8697 return result;
8698}
8699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008700PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702\n\
8703Return a list of the words in S, using sep as the\n\
8704delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008705splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008706whitespace string is a separator and empty strings are\n\
8707removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708
8709static PyObject*
8710unicode_split(PyUnicodeObject *self, PyObject *args)
8711{
8712 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008713 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008714
Martin v. Löwis18e16552006-02-15 17:27:45 +00008715 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 return NULL;
8717
8718 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008722 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724}
8725
Thomas Wouters477c8d52006-05-27 19:21:47 +00008726PyObject *
8727PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8728{
8729 PyObject* str_obj;
8730 PyObject* sep_obj;
8731 PyObject* out;
8732
8733 str_obj = PyUnicode_FromObject(str_in);
8734 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008736 sep_obj = PyUnicode_FromObject(sep_in);
8737 if (!sep_obj) {
8738 Py_DECREF(str_obj);
8739 return NULL;
8740 }
8741
8742 out = stringlib_partition(
8743 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8744 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8745 );
8746
8747 Py_DECREF(sep_obj);
8748 Py_DECREF(str_obj);
8749
8750 return out;
8751}
8752
8753
8754PyObject *
8755PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8756{
8757 PyObject* str_obj;
8758 PyObject* sep_obj;
8759 PyObject* out;
8760
8761 str_obj = PyUnicode_FromObject(str_in);
8762 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008764 sep_obj = PyUnicode_FromObject(sep_in);
8765 if (!sep_obj) {
8766 Py_DECREF(str_obj);
8767 return NULL;
8768 }
8769
8770 out = stringlib_rpartition(
8771 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8772 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8773 );
8774
8775 Py_DECREF(sep_obj);
8776 Py_DECREF(str_obj);
8777
8778 return out;
8779}
8780
8781PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008783\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008784Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008785the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008786found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008787
8788static PyObject*
8789unicode_partition(PyUnicodeObject *self, PyObject *separator)
8790{
8791 return PyUnicode_Partition((PyObject *)self, separator);
8792}
8793
8794PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008795 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008796\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008797Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008798the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008799separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008800
8801static PyObject*
8802unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8803{
8804 return PyUnicode_RPartition((PyObject *)self, separator);
8805}
8806
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008807PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 PyObject *sep,
8809 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008810{
8811 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008812
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008813 s = PyUnicode_FromObject(s);
8814 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008815 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008816 if (sep != NULL) {
8817 sep = PyUnicode_FromObject(sep);
8818 if (sep == NULL) {
8819 Py_DECREF(s);
8820 return NULL;
8821 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008822 }
8823
8824 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8825
8826 Py_DECREF(s);
8827 Py_XDECREF(sep);
8828 return result;
8829}
8830
8831PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008833\n\
8834Return a list of the words in S, using sep as the\n\
8835delimiter string, starting at the end of the string and\n\
8836working to the front. If maxsplit is given, at most maxsplit\n\
8837splits are done. If sep is not specified, any whitespace string\n\
8838is a separator.");
8839
8840static PyObject*
8841unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8842{
8843 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008844 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008845
Martin v. Löwis18e16552006-02-15 17:27:45 +00008846 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008847 return NULL;
8848
8849 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008851 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008853 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008855}
8856
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008857PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008858 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859\n\
8860Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008861Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008862is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863
8864static PyObject*
8865unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8866{
Guido van Rossum86662912000-04-11 15:38:46 +00008867 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868
Guido van Rossum86662912000-04-11 15:38:46 +00008869 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870 return NULL;
8871
Guido van Rossum86662912000-04-11 15:38:46 +00008872 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873}
8874
8875static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008876PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877{
Walter Dörwald346737f2007-05-31 10:44:43 +00008878 if (PyUnicode_CheckExact(self)) {
8879 Py_INCREF(self);
8880 return self;
8881 } else
8882 /* Subtype -- return genuine unicode string with the same value. */
8883 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8884 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885}
8886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008887PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008888 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889\n\
8890Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008891and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892
8893static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008894unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896 return fixup(self, fixswapcase);
8897}
8898
Georg Brandlceee0772007-11-27 23:48:05 +00008899PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008901\n\
8902Return a translation table usable for str.translate().\n\
8903If there is only one argument, it must be a dictionary mapping Unicode\n\
8904ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008905Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008906If there are two arguments, they must be strings of equal length, and\n\
8907in the resulting dictionary, each character in x will be mapped to the\n\
8908character at the same position in y. If there is a third argument, it\n\
8909must be a string, whose characters will be mapped to None in the result.");
8910
8911static PyObject*
8912unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8913{
8914 PyObject *x, *y = NULL, *z = NULL;
8915 PyObject *new = NULL, *key, *value;
8916 Py_ssize_t i = 0;
8917 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008918
Georg Brandlceee0772007-11-27 23:48:05 +00008919 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8920 return NULL;
8921 new = PyDict_New();
8922 if (!new)
8923 return NULL;
8924 if (y != NULL) {
8925 /* x must be a string too, of equal length */
8926 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8927 if (!PyUnicode_Check(x)) {
8928 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8929 "be a string if there is a second argument");
8930 goto err;
8931 }
8932 if (PyUnicode_GET_SIZE(x) != ylen) {
8933 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8934 "arguments must have equal length");
8935 goto err;
8936 }
8937 /* create entries for translating chars in x to those in y */
8938 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008939 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
Benjamin Peterson53aa1d72011-12-20 13:29:45 -06008940 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +00008941 goto err;
Benjamin Peterson53aa1d72011-12-20 13:29:45 -06008942 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8943 if (!value) {
8944 Py_DECREF(key);
8945 goto err;
8946 }
Georg Brandlceee0772007-11-27 23:48:05 +00008947 res = PyDict_SetItem(new, key, value);
8948 Py_DECREF(key);
8949 Py_DECREF(value);
8950 if (res < 0)
8951 goto err;
8952 }
8953 /* create entries for deleting chars in z */
8954 if (z != NULL) {
8955 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008956 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008957 if (!key)
8958 goto err;
8959 res = PyDict_SetItem(new, key, Py_None);
8960 Py_DECREF(key);
8961 if (res < 0)
8962 goto err;
8963 }
8964 }
8965 } else {
8966 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008967 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008968 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8969 "to maketrans it must be a dict");
8970 goto err;
8971 }
8972 /* copy entries into the new dict, converting string keys to int keys */
8973 while (PyDict_Next(x, &i, &key, &value)) {
8974 if (PyUnicode_Check(key)) {
8975 /* convert string keys to integer keys */
8976 PyObject *newkey;
8977 if (PyUnicode_GET_SIZE(key) != 1) {
8978 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8979 "table must be of length 1");
8980 goto err;
8981 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008982 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008983 if (!newkey)
8984 goto err;
8985 res = PyDict_SetItem(new, newkey, value);
8986 Py_DECREF(newkey);
8987 if (res < 0)
8988 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008989 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008990 /* just keep integer keys */
8991 if (PyDict_SetItem(new, key, value) < 0)
8992 goto err;
8993 } else {
8994 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8995 "be strings or integers");
8996 goto err;
8997 }
8998 }
8999 }
9000 return new;
9001 err:
9002 Py_DECREF(new);
9003 return NULL;
9004}
9005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009006PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009007 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008\n\
9009Return a copy of the string S, where all characters have been mapped\n\
9010through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009011Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00009012Unmapped characters are left untouched. Characters mapped to None\n\
9013are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014
9015static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009016unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017{
Georg Brandlceee0772007-11-27 23:48:05 +00009018 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019}
9020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009021PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009024Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025
9026static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009027unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029 return fixup(self, fixupper);
9030}
9031
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009032PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009033 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009035Pad a numeric string S with zeros on the left, to fill a field\n\
9036of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037
9038static PyObject *
9039unicode_zfill(PyUnicodeObject *self, PyObject *args)
9040{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009041 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 PyUnicodeObject *u;
9043
Martin v. Löwis18e16552006-02-15 17:27:45 +00009044 Py_ssize_t width;
9045 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046 return NULL;
9047
9048 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009049 if (PyUnicode_CheckExact(self)) {
9050 Py_INCREF(self);
9051 return (PyObject*) self;
9052 }
9053 else
9054 return PyUnicode_FromUnicode(
9055 PyUnicode_AS_UNICODE(self),
9056 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058 }
9059
9060 fill = width - self->length;
9061
9062 u = pad(self, fill, 0, '0');
9063
Walter Dörwald068325e2002-04-15 13:36:47 +00009064 if (u == NULL)
9065 return NULL;
9066
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067 if (u->str[fill] == '+' || u->str[fill] == '-') {
9068 /* move sign to beginning of string */
9069 u->str[0] = u->str[fill];
9070 u->str[fill] = '0';
9071 }
9072
9073 return (PyObject*) u;
9074}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075
#if 0
/* Compiled out.  Debugging helper that returns numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}

/* Compiled out.  Debugging helper that passes self's character buffer and
   size to PyUnicode_TransformDecimalToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
                                             PyUnicode_GET_SIZE(self));
}
#endif
9090
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009091PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009094Return True if S starts with the specified prefix, False otherwise.\n\
9095With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009096With optional end, stop comparing S at that position.\n\
9097prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098
9099static PyObject *
9100unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009101 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009103 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009105 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009106 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009107 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108
Jesus Ceaac451502011-04-20 17:09:23 +02009109 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009111 if (PyTuple_Check(subobj)) {
9112 Py_ssize_t i;
9113 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9114 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009116 if (substring == NULL)
9117 return NULL;
9118 result = tailmatch(self, substring, start, end, -1);
9119 Py_DECREF(substring);
9120 if (result) {
9121 Py_RETURN_TRUE;
9122 }
9123 }
9124 /* nothing matched */
9125 Py_RETURN_FALSE;
9126 }
9127 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009128 if (substring == NULL) {
9129 if (PyErr_ExceptionMatches(PyExc_TypeError))
9130 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9131 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009133 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009134 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009136 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137}
9138
9139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009140PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009143Return True if S ends with the specified suffix, False otherwise.\n\
9144With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009145With optional end, stop comparing S at that position.\n\
9146suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147
9148static PyObject *
9149unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009152 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009154 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009155 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009156 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157
Jesus Ceaac451502011-04-20 17:09:23 +02009158 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009160 if (PyTuple_Check(subobj)) {
9161 Py_ssize_t i;
9162 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9163 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009165 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009167 result = tailmatch(self, substring, start, end, +1);
9168 Py_DECREF(substring);
9169 if (result) {
9170 Py_RETURN_TRUE;
9171 }
9172 }
9173 Py_RETURN_FALSE;
9174 }
9175 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009176 if (substring == NULL) {
9177 if (PyErr_ExceptionMatches(PyExc_TypeError))
9178 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9179 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009181 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009182 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009184 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185}
9186
Eric Smith8c663262007-08-25 02:26:07 +00009187#include "stringlib/string_format.h"
9188
/* Docstrings for the "format" and "format_map" entries of unicode_methods,
   which are bound to do_string_format and do_string_format_map. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009200
Eric Smith4a7d76d2008-05-30 18:10:19 +00009201static PyObject *
9202unicode__format__(PyObject* self, PyObject* args)
9203{
9204 PyObject *format_spec;
9205
9206 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9207 return NULL;
9208
9209 return _PyUnicode_FormatAdvanced(self,
9210 PyUnicode_AS_UNICODE(format_spec),
9211 PyUnicode_GET_SIZE(format_spec));
9212}
9213
Eric Smith8c663262007-08-25 02:26:07 +00009214PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009216\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009217Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009218
9219static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009220unicode__sizeof__(PyUnicodeObject *v)
9221{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009222 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9223 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009224}
9225
9226PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009227 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009228
9229static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009230unicode_getnewargs(PyUnicodeObject *v)
9231{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009232 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009233}
9234
/* Method table for the unicode type.  Each entry gives the Python-level
   method name, the C function implementing it, the calling-convention
   flags and, where present, the docstring.  The table is terminated by
   the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry registered with METH_STATIC. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
    /* Compiled out. */
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
9295
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009296static PyObject *
9297unicode_mod(PyObject *v, PyObject *w)
9298{
Benjamin Peterson29060642009-01-31 22:14:21 +00009299 if (!PyUnicode_Check(v)) {
9300 Py_INCREF(Py_NotImplemented);
9301 return Py_NotImplemented;
9302 }
9303 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009304}
9305
9306static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009307 0, /*nb_add*/
9308 0, /*nb_subtract*/
9309 0, /*nb_multiply*/
9310 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009311};
9312
Guido van Rossumd57fd912000-03-10 22:53:23 +00009313static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009314 (lenfunc) unicode_length, /* sq_length */
9315 PyUnicode_Concat, /* sq_concat */
9316 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9317 (ssizeargfunc) unicode_getitem, /* sq_item */
9318 0, /* sq_slice */
9319 0, /* sq_ass_item */
9320 0, /* sq_ass_slice */
9321 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322};
9323
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009324static PyObject*
9325unicode_subscript(PyUnicodeObject* self, PyObject* item)
9326{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009327 if (PyIndex_Check(item)) {
9328 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009329 if (i == -1 && PyErr_Occurred())
9330 return NULL;
9331 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009332 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009333 return unicode_getitem(self, i);
9334 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009335 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009336 Py_UNICODE* source_buf;
9337 Py_UNICODE* result_buf;
9338 PyObject* result;
9339
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009340 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009341 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009342 return NULL;
9343 }
9344
9345 if (slicelength <= 0) {
9346 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009347 } else if (start == 0 && step == 1 && slicelength == self->length &&
9348 PyUnicode_CheckExact(self)) {
9349 Py_INCREF(self);
9350 return (PyObject *)self;
9351 } else if (step == 1) {
9352 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009353 } else {
9354 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009355 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9356 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009357
Benjamin Peterson29060642009-01-31 22:14:21 +00009358 if (result_buf == NULL)
9359 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009360
9361 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9362 result_buf[i] = source_buf[cur];
9363 }
Tim Petersced69f82003-09-16 20:30:58 +00009364
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009365 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009366 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009367 return result;
9368 }
9369 } else {
9370 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9371 return NULL;
9372 }
9373}
9374
9375static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009376 (lenfunc)unicode_length, /* mp_length */
9377 (binaryfunc)unicode_subscript, /* mp_subscript */
9378 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009379};
9380
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382/* Helpers for PyUnicode_Format() */
9383
9384static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009385getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009387 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 (*p_argidx)++;
9390 if (arglen < 0)
9391 return args;
9392 else
9393 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394 }
9395 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009396 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397 return NULL;
9398}
9399
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009400/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009402static PyObject *
9403formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009405 char *p;
9406 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009407 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009408
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409 x = PyFloat_AsDouble(v);
9410 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009411 return NULL;
9412
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009415
Eric Smith0923d1d2009-04-16 20:16:10 +00009416 p = PyOS_double_to_string(x, type, prec,
9417 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009418 if (p == NULL)
9419 return NULL;
9420 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009421 PyMem_Free(p);
9422 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423}
9424
Tim Peters38fd5b62000-09-21 05:43:11 +00009425static PyObject*
9426formatlong(PyObject *val, int flags, int prec, int type)
9427{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009428 char *buf;
9429 int len;
9430 PyObject *str; /* temporary string object. */
9431 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009432
Benjamin Peterson14339b62009-01-31 16:36:08 +00009433 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9434 if (!str)
9435 return NULL;
9436 result = PyUnicode_FromStringAndSize(buf, len);
9437 Py_DECREF(str);
9438 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009439}
9440
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441static int
9442formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009443 size_t buflen,
9444 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009446 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009447 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009448 if (PyUnicode_GET_SIZE(v) == 1) {
9449 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9450 buf[1] = '\0';
9451 return 1;
9452 }
9453#ifndef Py_UNICODE_WIDE
9454 if (PyUnicode_GET_SIZE(v) == 2) {
9455 /* Decode a valid surrogate pair */
9456 int c0 = PyUnicode_AS_UNICODE(v)[0];
9457 int c1 = PyUnicode_AS_UNICODE(v)[1];
9458 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9459 0xDC00 <= c1 && c1 <= 0xDFFF) {
9460 buf[0] = c0;
9461 buf[1] = c1;
9462 buf[2] = '\0';
9463 return 2;
9464 }
9465 }
9466#endif
9467 goto onError;
9468 }
9469 else {
9470 /* Integer input truncated to a character */
9471 long x;
9472 x = PyLong_AsLong(v);
9473 if (x == -1 && PyErr_Occurred())
9474 goto onError;
9475
9476 if (x < 0 || x > 0x10ffff) {
9477 PyErr_SetString(PyExc_OverflowError,
9478 "%c arg not in range(0x110000)");
9479 return -1;
9480 }
9481
9482#ifndef Py_UNICODE_WIDE
9483 if (x > 0xffff) {
9484 x -= 0x10000;
9485 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9486 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9487 return 2;
9488 }
9489#endif
9490 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009491 buf[1] = '\0';
9492 return 1;
9493 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009494
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009496 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009498 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499}
9500
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesised so the macro acts as a single
   operand wherever it appears (for example directly after sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009505
/* printf-style formatting: the implementation behind "format % args".

   format is converted with PyUnicode_FromObject().  args is either a
   tuple of positional values, a single non-tuple value, or -- when it is
   a mapping that is neither a tuple nor a str -- the source for %(key)
   lookups.

   Recognised conversions: %%, s r a, i d u o x X, e E f F g G and c,
   with the flags '-', '+', ' ', '#', '0', an optional width and
   ".precision" (either may be written as '*' to take it from the
   arguments) and an h/l/L length modifier that is skipped.

   Returns a new str object, or NULL with an exception set. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009507 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508{
9509 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009510 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 int args_owned = 0;
9512 PyUnicodeObject *result = NULL;
9513 PyObject *dict = NULL;
9514 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009515
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 PyErr_BadInternalCall();
9518 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 }
9520 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009521 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523 fmt = PyUnicode_AS_UNICODE(uformat);
9524 fmtcnt = PyUnicode_GET_SIZE(uformat);
9525
/* reslen is the allocated size of result, rescnt the number of units
   still free at res; both start with 100 units of slack. */
9526 reslen = rescnt = fmtcnt + 100;
9527 result = _PyUnicode_New(reslen);
9528 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 res = PyUnicode_AS_UNICODE(result);
9531
/* arglen < 0 marks "args is one bare value"; argidx == -2 then lets
   getnextarg() hand that value out exactly once. */
9532 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 arglen = PyTuple_Size(args);
9534 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535 }
9536 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 arglen = -1;
9538 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -04009540 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009541 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542
9543 while (--fmtcnt >= 0) {
/* literal character: copy it, growing the result first when full */
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 if (*fmt != '%') {
9545 if (--rescnt < 0) {
9546 rescnt = fmtcnt + 100;
9547 reslen += rescnt;
9548 if (_PyUnicode_Resize(&result, reslen) < 0)
9549 goto onError;
9550 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9551 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009552 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009553 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009554 }
9555 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009556 /* Got a format specifier */
9557 int flags = 0;
9558 Py_ssize_t width = -1;
9559 int prec = -1;
9560 Py_UNICODE c = '\0';
9561 Py_UNICODE fill;
9562 int isnumok;
9563 PyObject *v = NULL;
9564 PyObject *temp = NULL;
9565 Py_UNICODE *pbuf;
9566 Py_UNICODE sign;
9567 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009568 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 fmt++;
/* %(key)...: look the key up in the mapping; the value found becomes
   the single argument of this conversion (owned via args_owned). */
9571 if (*fmt == '(') {
9572 Py_UNICODE *keystart;
9573 Py_ssize_t keylen;
9574 PyObject *key;
9575 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009576
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 if (dict == NULL) {
9578 PyErr_SetString(PyExc_TypeError,
9579 "format requires a mapping");
9580 goto onError;
9581 }
9582 ++fmt;
9583 --fmtcnt;
9584 keystart = fmt;
9585 /* Skip over balanced parentheses */
9586 while (pcount > 0 && --fmtcnt >= 0) {
9587 if (*fmt == ')')
9588 --pcount;
9589 else if (*fmt == '(')
9590 ++pcount;
9591 fmt++;
9592 }
9593 keylen = fmt - keystart - 1;
9594 if (fmtcnt < 0 || pcount > 0) {
9595 PyErr_SetString(PyExc_ValueError,
9596 "incomplete format key");
9597 goto onError;
9598 }
9599#if 0
9600 /* keys are converted to strings using UTF-8 and
9601 then looked up since Python uses strings to hold
9602 variables names etc. in its namespaces and we
9603 wouldn't want to break common idioms. */
9604 key = PyUnicode_EncodeUTF8(keystart,
9605 keylen,
9606 NULL);
9607#else
9608 key = PyUnicode_FromUnicode(keystart, keylen);
9609#endif
9610 if (key == NULL)
9611 goto onError;
9612 if (args_owned) {
9613 Py_DECREF(args);
9614 args_owned = 0;
9615 }
9616 args = PyObject_GetItem(dict, key);
9617 Py_DECREF(key);
9618 if (args == NULL) {
9619 goto onError;
9620 }
9621 args_owned = 1;
9622 arglen = -1;
9623 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009624 }
/* flag characters; the loop ends on the first non-flag, left in c */
Benjamin Peterson29060642009-01-31 22:14:21 +00009625 while (--fmtcnt >= 0) {
9626 switch (c = *fmt++) {
9627 case '-': flags |= F_LJUST; continue;
9628 case '+': flags |= F_SIGN; continue;
9629 case ' ': flags |= F_BLANK; continue;
9630 case '#': flags |= F_ALT; continue;
9631 case '0': flags |= F_ZERO; continue;
9632 }
9633 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009634 }
/* field width: '*' takes it from the next argument (a negative value
   means left-justify), otherwise it is parsed from decimal digits */
Benjamin Peterson29060642009-01-31 22:14:21 +00009635 if (c == '*') {
9636 v = getnextarg(args, arglen, &argidx);
9637 if (v == NULL)
9638 goto onError;
9639 if (!PyLong_Check(v)) {
9640 PyErr_SetString(PyExc_TypeError,
9641 "* wants int");
9642 goto onError;
9643 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +02009644 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00009645 if (width == -1 && PyErr_Occurred())
9646 goto onError;
/* NOTE(review): -width would overflow for width == PY_SSIZE_T_MIN;
   confirm whether that value can reach this point. */
9647 if (width < 0) {
9648 flags |= F_LJUST;
9649 width = -width;
9650 }
9651 if (--fmtcnt >= 0)
9652 c = *fmt++;
9653 }
9654 else if (c >= '0' && c <= '9') {
9655 width = c - '0';
9656 while (--fmtcnt >= 0) {
9657 c = *fmt++;
9658 if (c < '0' || c > '9')
9659 break;
Mark Dickinsonfb90c092012-10-28 10:18:03 +00009660 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 PyErr_SetString(PyExc_ValueError,
9662 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009663 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009664 }
9665 width = width*10 + (c - '0');
9666 }
9667 }
/* precision: same two spellings; a negative '*' value is clamped to 0 */
9668 if (c == '.') {
9669 prec = 0;
9670 if (--fmtcnt >= 0)
9671 c = *fmt++;
9672 if (c == '*') {
9673 v = getnextarg(args, arglen, &argidx);
9674 if (v == NULL)
9675 goto onError;
9676 if (!PyLong_Check(v)) {
9677 PyErr_SetString(PyExc_TypeError,
9678 "* wants int");
9679 goto onError;
9680 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +02009681 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00009682 if (prec == -1 && PyErr_Occurred())
9683 goto onError;
9684 if (prec < 0)
9685 prec = 0;
9686 if (--fmtcnt >= 0)
9687 c = *fmt++;
9688 }
9689 else if (c >= '0' && c <= '9') {
9690 prec = c - '0';
9691 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009692 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009693 if (c < '0' || c > '9')
9694 break;
Mark Dickinsonfb90c092012-10-28 10:18:03 +00009695 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009696 PyErr_SetString(PyExc_ValueError,
9697 "prec too big");
9698 goto onError;
9699 }
9700 prec = prec*10 + (c - '0');
9701 }
9702 }
9703 } /* prec */
/* an h, l or L length modifier is skipped without any effect */
9704 if (fmtcnt >= 0) {
9705 if (c == 'h' || c == 'l' || c == 'L') {
9706 if (--fmtcnt >= 0)
9707 c = *fmt++;
9708 }
9709 }
9710 if (fmtcnt < 0) {
9711 PyErr_SetString(PyExc_ValueError,
9712 "incomplete format");
9713 goto onError;
9714 }
/* every conversion except %% consumes one argument */
9715 if (c != '%') {
9716 v = getnextarg(args, arglen, &argidx);
9717 if (v == NULL)
9718 goto onError;
9719 }
9720 sign = 0;
9721 fill = ' ';
/* Each case leaves the converted text in pbuf/len.  temp, when set,
   owns the object whose buffer pbuf points into. */
9722 switch (c) {
9723
9724 case '%':
9725 pbuf = formatbuf;
9726 /* presume that buffer length is at least 1 */
9727 pbuf[0] = '%';
9728 len = 1;
9729 break;
9730
/* %s, %r, %a: str(), repr() or ascii() of v; a precision truncates */
9731 case 's':
9732 case 'r':
9733 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009734 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009735 temp = v;
9736 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009737 }
9738 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009739 if (c == 's')
9740 temp = PyObject_Str(v);
9741 else if (c == 'r')
9742 temp = PyObject_Repr(v);
9743 else
9744 temp = PyObject_ASCII(v);
9745 if (temp == NULL)
9746 goto onError;
9747 if (PyUnicode_Check(temp))
9748 /* nothing to do */;
9749 else {
9750 Py_DECREF(temp);
9751 PyErr_SetString(PyExc_TypeError,
9752 "%s argument has non-string str()");
9753 goto onError;
9754 }
9755 }
9756 pbuf = PyUnicode_AS_UNICODE(temp);
9757 len = PyUnicode_GET_SIZE(temp);
9758 if (prec >= 0 && len > prec)
9759 len = prec;
9760 break;
9761
/* integer conversions: non-int numbers go through PyNumber_Long()
   first, and 'i' is formatted as 'd' */
9762 case 'i':
9763 case 'd':
9764 case 'u':
9765 case 'o':
9766 case 'x':
9767 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +00009768 isnumok = 0;
9769 if (PyNumber_Check(v)) {
9770 PyObject *iobj=NULL;
9771
9772 if (PyLong_Check(v)) {
9773 iobj = v;
9774 Py_INCREF(iobj);
9775 }
9776 else {
9777 iobj = PyNumber_Long(v);
9778 }
9779 if (iobj!=NULL) {
9780 if (PyLong_Check(iobj)) {
9781 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -07009782 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +00009783 Py_DECREF(iobj);
9784 if (!temp)
9785 goto onError;
9786 pbuf = PyUnicode_AS_UNICODE(temp);
9787 len = PyUnicode_GET_SIZE(temp);
9788 sign = 1;
9789 }
9790 else {
9791 Py_DECREF(iobj);
9792 }
9793 }
9794 }
9795 if (!isnumok) {
9796 PyErr_Format(PyExc_TypeError,
9797 "%%%c format: a number is required, "
9798 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9799 goto onError;
9800 }
9801 if (flags & F_ZERO)
9802 fill = '0';
9803 break;
9804
/* floating point conversions, delegated to formatfloat() */
9805 case 'e':
9806 case 'E':
9807 case 'f':
9808 case 'F':
9809 case 'g':
9810 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009811 temp = formatfloat(v, flags, prec, c);
9812 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009813 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009814 pbuf = PyUnicode_AS_UNICODE(temp);
9815 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009816 sign = 1;
9817 if (flags & F_ZERO)
9818 fill = '0';
9819 break;
9820
9821 case 'c':
9822 pbuf = formatbuf;
9823 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9824 if (len < 0)
9825 goto onError;
9826 break;
9827
9828 default:
9829 PyErr_Format(PyExc_ValueError,
9830 "unsupported format character '%c' (0x%x) "
9831 "at index %zd",
9832 (31<=c && c<=126) ? (char)c : '?',
9833 (int)c,
9834 (Py_ssize_t)(fmt - 1 -
9835 PyUnicode_AS_UNICODE(uformat)));
9836 goto onError;
9837 }
/* numeric results: strip a leading sign from the digits and decide
   which sign character (if any) has to be emitted */
9838 if (sign) {
9839 if (*pbuf == '-' || *pbuf == '+') {
9840 sign = *pbuf++;
9841 len--;
9842 }
9843 else if (flags & F_SIGN)
9844 sign = '+';
9845 else if (flags & F_BLANK)
9846 sign = ' ';
9847 else
9848 sign = 0;
9849 }
9850 if (width < len)
9851 width = len;
/* make sure the result has room for the padded field */
/* NOTE(review): width + fmtcnt + 100 is computed before the
   reslen < 0 check; for a very large width the addition itself may
   overflow -- confirm. */
9852 if (rescnt - (sign != 0) < width) {
9853 reslen -= rescnt;
9854 rescnt = width + fmtcnt + 100;
9855 reslen += rescnt;
9856 if (reslen < 0) {
9857 Py_XDECREF(temp);
9858 PyErr_NoMemory();
9859 goto onError;
9860 }
9861 if (_PyUnicode_Resize(&result, reslen) < 0) {
9862 Py_XDECREF(temp);
9863 goto onError;
9864 }
9865 res = PyUnicode_AS_UNICODE(result)
9866 + reslen - rescnt;
9867 }
/* with a non-space fill the sign is written before the padding; with
   space fill it is written after it (see below) */
9868 if (sign) {
9869 if (fill != ' ')
9870 *res++ = sign;
9871 rescnt--;
9872 if (width > len)
9873 width--;
9874 }
/* the same ordering applies to the two-character prefix produced by
   the '#' form of x, X and o */
9875 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9876 assert(pbuf[0] == '0');
9877 assert(pbuf[1] == c);
9878 if (fill != ' ') {
9879 *res++ = *pbuf++;
9880 *res++ = *pbuf++;
9881 }
9882 rescnt -= 2;
9883 width -= 2;
9884 if (width < 0)
9885 width = 0;
9886 len -= 2;
9887 }
/* left padding, unless the field is left-justified */
9888 if (width > len && !(flags & F_LJUST)) {
9889 do {
9890 --rescnt;
9891 *res++ = fill;
9892 } while (--width > len);
9893 }
9894 if (fill == ' ') {
9895 if (sign)
9896 *res++ = sign;
9897 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9898 assert(pbuf[0] == '0');
9899 assert(pbuf[1] == c);
9900 *res++ = *pbuf++;
9901 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009902 }
9903 }
/* the converted text itself, then any remaining padding as spaces */
Benjamin Peterson29060642009-01-31 22:14:21 +00009904 Py_UNICODE_COPY(res, pbuf, len);
9905 res += len;
9906 rescnt -= len;
9907 while (--width >= len) {
9908 --rescnt;
9909 *res++ = ' ';
9910 }
/* in mapping mode, complain if the current argument is still pending */
9911 if (dict && (argidx < arglen) && c != '%') {
9912 PyErr_SetString(PyExc_TypeError,
9913 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009914 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009915 goto onError;
9916 }
9917 Py_XDECREF(temp);
9918 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919 } /* until end */
/* without a mapping, every argument must have been consumed */
9920 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009921 PyErr_SetString(PyExc_TypeError,
9922 "not all arguments converted during string formatting");
9923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924 }
9925
/* trim the result to the length actually written */
Thomas Woutersa96affe2006-03-12 00:29:36 +00009926 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009927 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009929 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930 }
9931 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 return (PyObject *)result;
9933
/* common failure exit: release everything this function owns */
Benjamin Peterson29060642009-01-31 22:14:21 +00009934 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935 Py_XDECREF(result);
9936 Py_DECREF(uformat);
9937 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009938 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939 }
9940 return NULL;
9941}
9942
Jeremy Hylton938ace62002-07-17 16:30:39 +00009943static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009944unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9945
Tim Peters6d6c1a32001-08-02 04:15:00 +00009946static PyObject *
9947unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9948{
Benjamin Peterson29060642009-01-31 22:14:21 +00009949 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009950 static char *kwlist[] = {"object", "encoding", "errors", 0};
9951 char *encoding = NULL;
9952 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009953
Benjamin Peterson14339b62009-01-31 16:36:08 +00009954 if (type != &PyUnicode_Type)
9955 return unicode_subtype_new(type, args, kwds);
9956 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009957 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009958 return NULL;
9959 if (x == NULL)
9960 return (PyObject *)_PyUnicode_New(0);
9961 if (encoding == NULL && errors == NULL)
9962 return PyObject_Str(x);
9963 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009964 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009965}
9966
Guido van Rossume023fe02001-08-30 03:12:59 +00009967static PyObject *
9968unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9969{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009970 PyUnicodeObject *tmp, *pnew;
9971 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009972
Benjamin Peterson14339b62009-01-31 16:36:08 +00009973 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9974 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9975 if (tmp == NULL)
9976 return NULL;
9977 assert(PyUnicode_Check(tmp));
9978 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9979 if (pnew == NULL) {
9980 Py_DECREF(tmp);
9981 return NULL;
9982 }
9983 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9984 if (pnew->str == NULL) {
9985 _Py_ForgetReference((PyObject *)pnew);
9986 PyObject_Del(pnew);
9987 Py_DECREF(tmp);
9988 return PyErr_NoMemory();
9989 }
9990 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9991 pnew->length = n;
9992 pnew->hash = tmp->hash;
9993 Py_DECREF(tmp);
9994 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009995}
9996
/* Docstring of the str type: describes both call forms of the
   constructor (plain object, or bytes/buffer plus encoding/errors). */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009997PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -07009998"str(object='') -> str\n\
9999str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000010000\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100010001Create a new string object from the given object. If encoding or\n\
10002errors is specified, then the object must expose a data buffer\n\
10003that will be decoded using the given encoding and error handler.\n\
10004Otherwise, returns the result of object.__str__() (if defined)\n\
10005or repr(object).\n\
10006encoding defaults to sys.getdefaultencoding().\n\
10007errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000010008
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010009static PyObject *unicode_iter(PyObject *seq);
10010
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000010012 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010013 "str", /* tp_name */
10014 sizeof(PyUnicodeObject), /* tp_size */
10015 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010017 (destructor)unicode_dealloc, /* tp_dealloc */
10018 0, /* tp_print */
10019 0, /* tp_getattr */
10020 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010021 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010022 unicode_repr, /* tp_repr */
10023 &unicode_as_number, /* tp_as_number */
10024 &unicode_as_sequence, /* tp_as_sequence */
10025 &unicode_as_mapping, /* tp_as_mapping */
10026 (hashfunc) unicode_hash, /* tp_hash*/
10027 0, /* tp_call*/
10028 (reprfunc) unicode_str, /* tp_str */
10029 PyObject_GenericGetAttr, /* tp_getattro */
10030 0, /* tp_setattro */
10031 0, /* tp_as_buffer */
10032 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000010033 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010034 unicode_doc, /* tp_doc */
10035 0, /* tp_traverse */
10036 0, /* tp_clear */
10037 PyUnicode_RichCompare, /* tp_richcompare */
10038 0, /* tp_weaklistoffset */
10039 unicode_iter, /* tp_iter */
10040 0, /* tp_iternext */
10041 unicode_methods, /* tp_methods */
10042 0, /* tp_members */
10043 0, /* tp_getset */
10044 &PyBaseObject_Type, /* tp_base */
10045 0, /* tp_dict */
10046 0, /* tp_descr_get */
10047 0, /* tp_descr_set */
10048 0, /* tp_dictoffset */
10049 0, /* tp_init */
10050 0, /* tp_alloc */
10051 unicode_new, /* tp_new */
10052 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010053};
10054
10055/* Initialize the Unicode implementation */
10056
Thomas Wouters78890102000-07-22 19:25:51 +000010057void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010058{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010059 int i;
10060
Thomas Wouters477c8d52006-05-27 19:21:47 +000010061 /* XXX - move this array to unicodectype.c ? */
10062 Py_UNICODE linebreak[] = {
10063 0x000A, /* LINE FEED */
10064 0x000D, /* CARRIAGE RETURN */
10065 0x001C, /* FILE SEPARATOR */
10066 0x001D, /* GROUP SEPARATOR */
10067 0x001E, /* RECORD SEPARATOR */
10068 0x0085, /* NEXT LINE */
10069 0x2028, /* LINE SEPARATOR */
10070 0x2029, /* PARAGRAPH SEPARATOR */
10071 };
10072
Fred Drakee4315f52000-05-09 19:53:39 +000010073 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010074 free_list = NULL;
10075 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010077 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010078 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010079
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010080 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010081 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010082 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010083 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010084
10085 /* initialize the linebreak bloom filter */
10086 bloom_linebreak = make_bloom_mask(
10087 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10088 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010089
10090 PyType_Ready(&EncodingMapType);
Benjamin Petersonc4311282012-10-30 23:21:10 -040010091
10092 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
10093 Py_FatalError("Can't initialize field name iterator type");
10094
10095 if (PyType_Ready(&PyFormatterIter_Type) < 0)
10096 Py_FatalError("Can't initialize formatter iter type");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097}
10098
10099/* Finalize the Unicode implementation */
10100
Christian Heimesa156e092008-02-16 07:38:31 +000010101int
10102PyUnicode_ClearFreeList(void)
10103{
10104 int freelist_size = numfree;
10105 PyUnicodeObject *u;
10106
10107 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010108 PyUnicodeObject *v = u;
10109 u = *(PyUnicodeObject **)u;
10110 if (v->str)
10111 PyObject_DEL(v->str);
10112 Py_XDECREF(v->defenc);
10113 PyObject_Del(v);
10114 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010115 }
10116 free_list = NULL;
10117 assert(numfree == 0);
10118 return freelist_size;
10119}
10120
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121void
Thomas Wouters78890102000-07-22 19:25:51 +000010122_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010124 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010126 Py_XDECREF(unicode_empty);
10127 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010128
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010129 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010130 if (unicode_latin1[i]) {
10131 Py_DECREF(unicode_latin1[i]);
10132 unicode_latin1[i] = NULL;
10133 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010134 }
Christian Heimesa156e092008-02-16 07:38:31 +000010135 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010137
Walter Dörwald16807132007-05-25 13:52:07 +000010138void
10139PyUnicode_InternInPlace(PyObject **p)
10140{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010141 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10142 PyObject *t;
10143 if (s == NULL || !PyUnicode_Check(s))
10144 Py_FatalError(
10145 "PyUnicode_InternInPlace: unicode strings only please!");
10146 /* If it's a subclass, we don't really know what putting
10147 it in the interned dict might do. */
10148 if (!PyUnicode_CheckExact(s))
10149 return;
10150 if (PyUnicode_CHECK_INTERNED(s))
10151 return;
10152 if (interned == NULL) {
10153 interned = PyDict_New();
10154 if (interned == NULL) {
10155 PyErr_Clear(); /* Don't leave an exception */
10156 return;
10157 }
10158 }
10159 /* It might be that the GetItem call fails even
10160 though the key is present in the dictionary,
10161 namely when this happens during a stack overflow. */
10162 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010163 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010164 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010165
Benjamin Peterson29060642009-01-31 22:14:21 +000010166 if (t) {
10167 Py_INCREF(t);
10168 Py_DECREF(*p);
10169 *p = t;
10170 return;
10171 }
Walter Dörwald16807132007-05-25 13:52:07 +000010172
Benjamin Peterson14339b62009-01-31 16:36:08 +000010173 PyThreadState_GET()->recursion_critical = 1;
10174 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10175 PyErr_Clear();
10176 PyThreadState_GET()->recursion_critical = 0;
10177 return;
10178 }
10179 PyThreadState_GET()->recursion_critical = 0;
10180 /* The two references in interned are not counted by refcnt.
10181 The deallocator will take care of this */
10182 Py_REFCNT(s) -= 2;
10183 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010184}
10185
10186void
10187PyUnicode_InternImmortal(PyObject **p)
10188{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010189 PyUnicode_InternInPlace(p);
10190 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10191 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10192 Py_INCREF(*p);
10193 }
Walter Dörwald16807132007-05-25 13:52:07 +000010194}
10195
10196PyObject *
10197PyUnicode_InternFromString(const char *cp)
10198{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010199 PyObject *s = PyUnicode_FromString(cp);
10200 if (s == NULL)
10201 return NULL;
10202 PyUnicode_InternInPlace(&s);
10203 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010204}
10205
10206void _Py_ReleaseInternedUnicodeStrings(void)
10207{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010208 PyObject *keys;
10209 PyUnicodeObject *s;
10210 Py_ssize_t i, n;
10211 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010212
Benjamin Peterson14339b62009-01-31 16:36:08 +000010213 if (interned == NULL || !PyDict_Check(interned))
10214 return;
10215 keys = PyDict_Keys(interned);
10216 if (keys == NULL || !PyList_Check(keys)) {
10217 PyErr_Clear();
10218 return;
10219 }
Walter Dörwald16807132007-05-25 13:52:07 +000010220
Benjamin Peterson14339b62009-01-31 16:36:08 +000010221 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10222 detector, interned unicode strings are not forcibly deallocated;
10223 rather, we give them their stolen references back, and then clear
10224 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010225
Benjamin Peterson14339b62009-01-31 16:36:08 +000010226 n = PyList_GET_SIZE(keys);
10227 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010228 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010229 for (i = 0; i < n; i++) {
10230 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10231 switch (s->state) {
10232 case SSTATE_NOT_INTERNED:
10233 /* XXX Shouldn't happen */
10234 break;
10235 case SSTATE_INTERNED_IMMORTAL:
10236 Py_REFCNT(s) += 1;
10237 immortal_size += s->length;
10238 break;
10239 case SSTATE_INTERNED_MORTAL:
10240 Py_REFCNT(s) += 2;
10241 mortal_size += s->length;
10242 break;
10243 default:
10244 Py_FatalError("Inconsistent interned string state.");
10245 }
10246 s->state = SSTATE_NOT_INTERNED;
10247 }
10248 fprintf(stderr, "total size of all interned strings: "
10249 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10250 "mortal/immortal\n", mortal_size, immortal_size);
10251 Py_DECREF(keys);
10252 PyDict_Clear(interned);
10253 Py_DECREF(interned);
10254 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010255}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010256
10257
10258/********************* Unicode Iterator **************************/
10259
10260typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010261 PyObject_HEAD
10262 Py_ssize_t it_index;
10263 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010264} unicodeiterobject;
10265
10266static void
10267unicodeiter_dealloc(unicodeiterobject *it)
10268{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010269 _PyObject_GC_UNTRACK(it);
10270 Py_XDECREF(it->it_seq);
10271 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010272}
10273
10274static int
10275unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10276{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010277 Py_VISIT(it->it_seq);
10278 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010279}
10280
10281static PyObject *
10282unicodeiter_next(unicodeiterobject *it)
10283{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010284 PyUnicodeObject *seq;
10285 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010286
Benjamin Peterson14339b62009-01-31 16:36:08 +000010287 assert(it != NULL);
10288 seq = it->it_seq;
10289 if (seq == NULL)
10290 return NULL;
10291 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010292
Benjamin Peterson14339b62009-01-31 16:36:08 +000010293 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10294 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010295 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010296 if (item != NULL)
10297 ++it->it_index;
10298 return item;
10299 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010300
Benjamin Peterson14339b62009-01-31 16:36:08 +000010301 Py_DECREF(seq);
10302 it->it_seq = NULL;
10303 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010304}
10305
10306static PyObject *
10307unicodeiter_len(unicodeiterobject *it)
10308{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010309 Py_ssize_t len = 0;
10310 if (it->it_seq)
10311 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10312 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010313}
10314
10315PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10316
10317static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010318 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010319 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010320 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010321};
10322
10323PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010324 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10325 "str_iterator", /* tp_name */
10326 sizeof(unicodeiterobject), /* tp_basicsize */
10327 0, /* tp_itemsize */
10328 /* methods */
10329 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10330 0, /* tp_print */
10331 0, /* tp_getattr */
10332 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010333 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010334 0, /* tp_repr */
10335 0, /* tp_as_number */
10336 0, /* tp_as_sequence */
10337 0, /* tp_as_mapping */
10338 0, /* tp_hash */
10339 0, /* tp_call */
10340 0, /* tp_str */
10341 PyObject_GenericGetAttr, /* tp_getattro */
10342 0, /* tp_setattro */
10343 0, /* tp_as_buffer */
10344 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10345 0, /* tp_doc */
10346 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10347 0, /* tp_clear */
10348 0, /* tp_richcompare */
10349 0, /* tp_weaklistoffset */
10350 PyObject_SelfIter, /* tp_iter */
10351 (iternextfunc)unicodeiter_next, /* tp_iternext */
10352 unicodeiter_methods, /* tp_methods */
10353 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010354};
10355
10356static PyObject *
10357unicode_iter(PyObject *seq)
10358{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010359 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010360
Benjamin Peterson14339b62009-01-31 16:36:08 +000010361 if (!PyUnicode_Check(seq)) {
10362 PyErr_BadInternalCall();
10363 return NULL;
10364 }
10365 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10366 if (it == NULL)
10367 return NULL;
10368 it->it_index = 0;
10369 Py_INCREF(seq);
10370 it->it_seq = (PyUnicodeObject *)seq;
10371 _PyObject_GC_TRACK(it);
10372 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010373}
10374
Martin v. Löwis5b222132007-06-10 09:51:05 +000010375size_t
10376Py_UNICODE_strlen(const Py_UNICODE *u)
10377{
10378 int res = 0;
10379 while(*u++)
10380 res++;
10381 return res;
10382}
10383
10384Py_UNICODE*
10385Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10386{
10387 Py_UNICODE *u = s1;
10388 while ((*u++ = *s2++));
10389 return s1;
10390}
10391
10392Py_UNICODE*
10393Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10394{
10395 Py_UNICODE *u = s1;
10396 while ((*u++ = *s2++))
10397 if (n-- == 0)
10398 break;
10399 return s1;
10400}
10401
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010402Py_UNICODE*
10403Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10404{
10405 Py_UNICODE *u1 = s1;
10406 u1 += Py_UNICODE_strlen(u1);
10407 Py_UNICODE_strcpy(u1, s2);
10408 return s1;
10409}
10410
Martin v. Löwis5b222132007-06-10 09:51:05 +000010411int
10412Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10413{
10414 while (*s1 && *s2 && *s1 == *s2)
10415 s1++, s2++;
10416 if (*s1 && *s2)
10417 return (*s1 < *s2) ? -1 : +1;
10418 if (*s1)
10419 return 1;
10420 if (*s2)
10421 return -1;
10422 return 0;
10423}
10424
Victor Stinneref8d95c2010-08-16 22:03:11 +000010425int
10426Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10427{
10428 register Py_UNICODE u1, u2;
10429 for (; n != 0; n--) {
10430 u1 = *s1;
10431 u2 = *s2;
10432 if (u1 != u2)
10433 return (u1 < u2) ? -1 : +1;
10434 if (u1 == '\0')
10435 return 0;
10436 s1++;
10437 s2++;
10438 }
10439 return 0;
10440}
10441
Martin v. Löwis5b222132007-06-10 09:51:05 +000010442Py_UNICODE*
10443Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10444{
10445 const Py_UNICODE *p;
10446 for (p = s; *p; p++)
10447 if (*p == c)
10448 return (Py_UNICODE*)p;
10449 return NULL;
10450}
10451
Victor Stinner331ea922010-08-10 16:37:20 +000010452Py_UNICODE*
10453Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10454{
10455 const Py_UNICODE *p;
10456 p = s + Py_UNICODE_strlen(s);
10457 while (p != s) {
10458 p--;
10459 if (*p == c)
10460 return (Py_UNICODE*)p;
10461 }
10462 return NULL;
10463}
10464
Victor Stinner71133ff2010-09-01 23:43:53 +000010465Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010466PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010467{
10468 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10469 Py_UNICODE *copy;
10470 Py_ssize_t size;
10471
10472 /* Ensure we won't overflow the size. */
10473 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10474 PyErr_NoMemory();
10475 return NULL;
10476 }
10477 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10478 size *= sizeof(Py_UNICODE);
10479 copy = PyMem_Malloc(size);
10480 if (copy == NULL) {
10481 PyErr_NoMemory();
10482 return NULL;
10483 }
10484 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10485 return copy;
10486}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010487
Georg Brandl66c221e2010-10-14 07:04:07 +000010488/* A _string module, to export formatter_parser and formatter_field_name_split
10489 to the string.Formatter class implemented in Python. */
10490
10491static PyMethodDef _string_methods[] = {
10492 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10493 METH_O, PyDoc_STR("split the argument as a field name")},
10494 {"formatter_parser", (PyCFunction) formatter_parser,
10495 METH_O, PyDoc_STR("parse the argument as a format string")},
10496 {NULL, NULL}
10497};
10498
10499static struct PyModuleDef _string_module = {
10500 PyModuleDef_HEAD_INIT,
10501 "_string",
10502 PyDoc_STR("string helper module"),
10503 0,
10504 _string_methods,
10505 NULL,
10506 NULL,
10507 NULL,
10508 NULL
10509};
10510
10511PyMODINIT_FUNC
10512PyInit__string(void)
10513{
10514 return PyModule_Create(&_string_module);
10515}
10516
10517
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010518#ifdef __cplusplus
10519}
10520#endif