blob: cbda72532d8d65cac92b278c763ff6005003f7d9 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Lookup table for the 128 ASCII code points: an entry is 1 when the
   character counts as whitespace and 0 otherwise.  Sixteen entries per
   row, so each row covers one hexadecimal "column" 0xN0 - 0xNF. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
     *   0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
     *   0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
     *   0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
     *   0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
     *   0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
144
/* Forward declarations of two static helpers used by the encoders.
   Both are declared here so that code earlier in the file can call them;
   being static, their definitions live elsewhere in this translation
   unit. */

/* Takes the error-handler name (`errors`), a cached handler slot
   (`errorHandler`), the codec name and failure reason, the Unicode buffer
   with its size, a cached exception slot, the offending range
   [startpos, endpos) and an out-parameter `newpos`.  Returns a PyObject *.
   NOTE(review): exact ownership/return semantics are defined with the
   implementation, not visible here. */
static PyObject *unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

/* Takes the cached exception slot, codec name, Unicode buffer with its
   size, the offending range [startpos, endpos) and the failure reason.
   Presumably builds and raises the encode error -- confirm against the
   definition. */
static void raise_encode_exception(PyObject **exceptionObject,
                                   const char *encoding,
                                   const Py_UNICODE *unicode, Py_ssize_t size,
                                   Py_ssize_t startpos, Py_ssize_t endpos,
                                   const char *reason);
155
/* Lookup table for the 128 ASCII code points: an entry is 1 when the
   character is a line break and 0 otherwise.  The table is only ever
   indexed for reading (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
183
184
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000185Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000186PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000188#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000189 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 /* This is actually an illegal character, so it should
192 not be passed to unichr. */
193 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#endif
195}
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197/* --- Bloom Filters ----------------------------------------------------- */
198
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used; the bit index for a
   character is its value masked with (BLOOM_WIDTH - 1), i.e. its low
   bits. */

/* The linebreak mask (bloom_linebreak) is not filled in here; per the
   original note it is set up by the Unicode init code further down. */

/* BLOOM_WIDTH: number of bits used in a mask -- the largest of 128, 64 or
   32 that does not exceed LONG_BIT.  Anything smaller than 32 bits is
   rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type that holds a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for `ch` in `mask` (which must be a modifiable
   lvalue); BLOOM tests that bit and yields a nonzero value if it is set.
   `mask` is not parenthesised in either expansion, so pass a plain
   variable rather than an expression. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line-break test: characters below 128 are looked up directly in
   ascii_linebreak; for the rest the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK is only evaluated when the mask bit is set.
   `ch` is evaluated more than once, so it must be free of side effects. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228 /* calculate simple bloom-style bitmask for a given unicode string */
229
Antoine Pitrouf068f942010-01-13 14:19:12 +0000230 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 Py_ssize_t i;
232
233 mask = 0;
234 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000235 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236
237 return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242 Py_ssize_t i;
243
244 for (i = 0; i < setlen; i++)
245 if (set[i] == chr)
246 return 1;
247
248 return 0;
249}
250
/* Test whether `chr` belongs to the character set `set` (of `setlen`
   characters) whose bloom mask is `mask`.  The cheap bloom test runs
   first; the exact unicode_member() scan is only evaluated when the mask
   bit is set.  The whole expansion is parenthesised so the macro can be
   negated or combined with other operators safely.  `chr` is evaluated
   more than once and must be free of side effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                 \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
253
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254/* --- Unicode Object ----------------------------------------------------- */
255
256static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259{
260 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000261
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000266 /* Resizing shared object (unicode_empty or single character
267 objects) in-place is not allowed. Use PyUnicode_Resize()
268 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 (unicode->length == 1 &&
272 unicode->str[0] < 256U &&
273 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000275 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 return -1;
277 }
278
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279 /* We allocate one more byte to make sure the string is Ux0000 terminated.
280 The overallocation is also used by fastsearch, which assumes that it's
281 safe to look at str[length] (without making any assumptions about what
282 it contains). */
283
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000285 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000286 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 PyErr_NoMemory();
290 return -1;
291 }
292 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294
Benjamin Peterson29060642009-01-31 22:14:21 +0000295 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000298 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
305/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000306 Ux0000 terminated; some code (e.g. new_identifier)
307 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308
309 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000310 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000417 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 }
419 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000420 *(PyUnicodeObject **)unicode = free_list;
421 free_list = unicode;
422 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000425 PyObject_DEL(unicode->str);
426 Py_XDECREF(unicode->defenc);
427 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 }
429}
430
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433{
434 register PyUnicodeObject *v;
435
436 /* Argument checks */
437 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000441 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 PyErr_BadInternalCall();
444 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 }
446
447 /* Resizing unicode_empty and single character objects is not
448 possible since these are being shared. We simply return a fresh
449 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000450 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 (v == unicode_empty || v->length == 1)) {
452 PyUnicodeObject *w = _PyUnicode_New(length);
453 if (w == NULL)
454 return -1;
455 Py_UNICODE_COPY(w->str, v->str,
456 length < v->length ? length : v->length);
457 Py_DECREF(*unicode);
458 *unicode = w;
459 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
461
462 /* Note that we don't have to modify *unicode for unshared Unicode
463 objects, since we can modify them in-place. */
464 return unicode_resize(v, length);
465}
466
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474{
475 PyUnicodeObject *unicode;
476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 /* If the Unicode data is known at construction time, we can apply
478 some optimizations which share commonly used objects. */
479 if (u != NULL) {
480
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000486
487 /* Single character Unicode objects in the Latin-1 range are
488 shared when using this constructor */
489 if (size == 1 && *u < 256) {
490 unicode = unicode_latin1[*u];
491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = *u;
496 unicode_latin1[*u] = unicode;
497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000501 }
Tim Petersced69f82003-09-16 20:30:58 +0000502
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 /* Copy the Unicode data into the new object */
508 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000509 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 return (PyObject *)unicode;
512}
513
Walter Dörwaldd2034312007-05-18 16:29:38 +0000514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000515{
516 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000517
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 return NULL;
522 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000525 some optimizations which share commonly used objects.
526 Also, this means the input must be UTF-8, so fall back to the
527 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (u != NULL) {
529
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 /* Optimization for empty strings */
531 if (size == 0 && unicode_empty != NULL) {
532 Py_INCREF(unicode_empty);
533 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000534 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000535
536 /* Single characters are shared when using this constructor.
537 Restrict to ASCII, since the input must be UTF-8. */
538 if (size == 1 && Py_CHARMASK(*u) < 128) {
539 unicode = unicode_latin1[Py_CHARMASK(*u)];
540 if (!unicode) {
541 unicode = _PyUnicode_New(1);
542 if (!unicode)
543 return NULL;
544 unicode->str[0] = Py_CHARMASK(*u);
545 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 }
547 Py_INCREF(unicode);
548 return (PyObject *)unicode;
549 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000550
551 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000552 }
553
Walter Dörwald55507312007-05-18 13:12:10 +0000554 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 if (!unicode)
556 return NULL;
557
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 return (PyObject *)unicode;
559}
560
Walter Dörwaldd2034312007-05-18 16:29:38 +0000561PyObject *PyUnicode_FromString(const char *u)
562{
563 size_t size = strlen(u);
564 if (size > PY_SSIZE_T_MAX) {
565 PyErr_SetString(PyExc_OverflowError, "input too long");
566 return NULL;
567 }
568
569 return PyUnicode_FromStringAndSize(u, size);
570}
571
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572#ifdef HAVE_WCHAR_H
573
Mark Dickinson081dfee2009-03-18 14:47:41 +0000574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581 to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584 Py_ssize_t size)
585{
586 PyUnicodeObject *unicode;
587 register Py_ssize_t i;
588 Py_ssize_t alloc;
589 const wchar_t *orig_w;
590
591 if (w == NULL) {
592 if (size == 0)
593 return PyUnicode_FromStringAndSize(NULL, 0);
594 PyErr_BadInternalCall();
595 return NULL;
596 }
597
598 if (size == -1) {
599 size = wcslen(w);
600 }
601
602 alloc = size;
603 orig_w = w;
604 for (i = size; i > 0; i--) {
605 if (*w > 0xFFFF)
606 alloc++;
607 w++;
608 }
609 w = orig_w;
610 unicode = _PyUnicode_New(alloc);
611 if (!unicode)
612 return NULL;
613
614 /* Copy the wchar_t data into the new object */
615 {
616 register Py_UNICODE *u;
617 u = PyUnicode_AS_UNICODE(unicode);
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF) {
620 wchar_t ordinal = *w++;
621 ordinal -= 0x10000;
622 *u++ = 0xD800 | (ordinal >> 10);
623 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 }
625 else
626 *u++ = *w++;
627 }
628 }
629 return (PyObject *)unicode;
630}
631
632#else
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636{
637 PyUnicodeObject *unicode;
638
639 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000640 if (size == 0)
641 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == -1) {
647 size = wcslen(w);
648 }
649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = _PyUnicode_New(size);
651 if (!unicode)
652 return NULL;
653
654 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000657#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000659 register Py_UNICODE *u;
660 register Py_ssize_t i;
661 u = PyUnicode_AS_UNICODE(unicode);
662 for (i = size; i > 0; i--)
663 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665#endif
666
667 return (PyObject *)unicode;
668}
669
Mark Dickinson081dfee2009-03-18 14:47:41 +0000670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
Walter Dörwald346737f2007-05-31 10:44:43 +0000674static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000688 else if (longlongflag) {
689 /* longlongflag should only ever be nonzero on machines with
690 HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692 char *f = PY_FORMAT_LONG_LONG;
693 while (*f)
694 *fmt++ = *f++;
695#else
696 /* we shouldn't ever get here */
697 assert(0);
698 *fmt++ = 'l';
699#endif
700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000701 else if (size_tflag) {
702 char *f = PY_FORMAT_SIZE_T;
703 while (*f)
704 *fmt++ = *f++;
705 }
706 *fmt++ = c;
707 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000708}
709
Walter Dörwaldd2034312007-05-18 16:29:38 +0000710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
711
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000712/* size of fixed-size buffer for formatting single arguments */
713#define ITEM_BUFFER_LEN 21
714/* maximum number of characters required for output of %ld. 21 characters
715 allows for 64-bit integers (in decimal) and an optional sign. */
716#define MAX_LONG_CHARS 21
717/* maximum number of characters required for output of %lld.
718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 va_list count;
726 Py_ssize_t callcount = 0;
727 PyObject **callresults = NULL;
728 PyObject **callresult = NULL;
729 Py_ssize_t n = 0;
730 int width = 0;
731 int precision = 0;
732 int zeropad;
733 const char* f;
734 Py_UNICODE *s;
735 PyObject *string;
736 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000737 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 /* use abuffer instead of buffer, if we need more space
739 * (which can happen if there's a format specifier with width). */
740 char *abuffer = NULL;
741 char *realbuffer;
742 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000746 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000747 /* step 1: count the number of %S/%R/%A/%s format specifications
748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000752 if (*f == '%') {
753 if (*(f+1)=='%')
754 continue;
Victor Stinner2b574a22011-03-01 22:48:49 +0000755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000756 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000757 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000758 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 ;
761 if (*f == 's')
762 ++callcount;
763 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000764 else if (128 <= (unsigned char)*f) {
765 PyErr_Format(PyExc_ValueError,
766 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000767 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000768 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000769 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 }
772 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 if (callcount) {
775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776 if (!callresults) {
777 PyErr_NoMemory();
778 return NULL;
779 }
780 callresult = callresults;
781 }
782 /* step 3: figure out how large a buffer we need */
783 for (f = format; *f; f++) {
784 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000785#ifdef HAVE_LONG_LONG
786 int longlongflag = 0;
787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000788 const char* p = f;
789 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000790 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796 * they don't affect the amount of space we reserve.
797 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000798 if (*f == 'l') {
799 if (f[1] == 'd' || f[1] == 'u') {
800 ++f;
801 }
802#ifdef HAVE_LONG_LONG
803 else if (f[1] == 'l' &&
804 (f[2] == 'd' || f[2] == 'u')) {
805 longlongflag = 1;
806 f += 2;
807 }
808#endif
809 }
810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000811 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000812 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
Benjamin Peterson14339b62009-01-31 16:36:08 +0000814 switch (*f) {
815 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +0000816 {
817#ifndef Py_UNICODE_WIDE
818 int ordinal = va_arg(count, int);
819 if (ordinal > 0xffff)
820 n += 2;
821 else
822 n++;
823#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000824 (void)va_arg(count, int);
Victor Stinner659eb842011-02-23 12:14:22 +0000825 n++;
826#endif
827 break;
828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 case '%':
830 n++;
831 break;
832 case 'd': case 'u': case 'i': case 'x':
833 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000834#ifdef HAVE_LONG_LONG
835 if (longlongflag) {
836 if (width < MAX_LONG_LONG_CHARS)
837 width = MAX_LONG_LONG_CHARS;
838 }
839 else
840#endif
841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842 including sign. Decimal takes the most space. This
843 isn't enough for octal. If a width is specified we
844 need more (which we allocate later). */
845 if (width < MAX_LONG_CHARS)
846 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000848 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000849 if (abuffersize < width)
850 abuffersize = width;
851 break;
852 case 's':
853 {
854 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000855 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857 if (!str)
858 goto fail;
859 n += PyUnicode_GET_SIZE(str);
860 /* Remember the str and switch to the next slot */
861 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000862 break;
863 }
864 case 'U':
865 {
866 PyObject *obj = va_arg(count, PyObject *);
867 assert(obj && PyUnicode_Check(obj));
868 n += PyUnicode_GET_SIZE(obj);
869 break;
870 }
871 case 'V':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 const char *str = va_arg(count, const char *);
Victor Stinner2b574a22011-03-01 22:48:49 +0000875 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000876 assert(obj || str);
877 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2b574a22011-03-01 22:48:49 +0000878 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000879 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2b574a22011-03-01 22:48:49 +0000880 *callresult++ = NULL;
881 }
882 else {
883 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
884 if (!str_obj)
885 goto fail;
886 n += PyUnicode_GET_SIZE(str_obj);
887 *callresult++ = str_obj;
888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000889 break;
890 }
891 case 'S':
892 {
893 PyObject *obj = va_arg(count, PyObject *);
894 PyObject *str;
895 assert(obj);
896 str = PyObject_Str(obj);
897 if (!str)
898 goto fail;
899 n += PyUnicode_GET_SIZE(str);
900 /* Remember the str and switch to the next slot */
901 *callresult++ = str;
902 break;
903 }
904 case 'R':
905 {
906 PyObject *obj = va_arg(count, PyObject *);
907 PyObject *repr;
908 assert(obj);
909 repr = PyObject_Repr(obj);
910 if (!repr)
911 goto fail;
912 n += PyUnicode_GET_SIZE(repr);
913 /* Remember the repr and switch to the next slot */
914 *callresult++ = repr;
915 break;
916 }
917 case 'A':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 PyObject *ascii;
921 assert(obj);
922 ascii = PyObject_ASCII(obj);
923 if (!ascii)
924 goto fail;
925 n += PyUnicode_GET_SIZE(ascii);
926 /* Remember the repr and switch to the next slot */
927 *callresult++ = ascii;
928 break;
929 }
930 case 'p':
931 (void) va_arg(count, int);
932 /* maximum 64-bit pointer representation:
933 * 0xffffffffffffffff
934 * so 19 characters is enough.
935 * XXX I count 18 -- what's the extra for?
936 */
937 n += 19;
938 break;
939 default:
940 /* if we stumble upon an unknown
941 formatting code, copy the rest of
942 the format string to the output
943 string. (we cannot just skip the
944 code, since there's no way to know
945 what's in the argument list) */
946 n += strlen(p);
947 goto expand;
948 }
949 } else
950 n++;
951 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000952 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000953 if (abuffersize > ITEM_BUFFER_LEN) {
954 /* add 1 for sprintf's trailing null byte */
955 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 if (!abuffer) {
957 PyErr_NoMemory();
958 goto fail;
959 }
960 realbuffer = abuffer;
961 }
962 else
963 realbuffer = buffer;
964 /* step 4: fill the buffer */
965 /* Since we've analyzed how much space we need for the worst case,
966 we don't have to resize the string.
967 There can be no errors beyond this point. */
968 string = PyUnicode_FromUnicode(NULL, n);
969 if (!string)
970 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000971
Benjamin Peterson14339b62009-01-31 16:36:08 +0000972 s = PyUnicode_AS_UNICODE(string);
973 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000974
Benjamin Peterson14339b62009-01-31 16:36:08 +0000975 for (f = format; *f; f++) {
976 if (*f == '%') {
977 const char* p = f++;
978 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000979 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000980 int size_tflag = 0;
981 zeropad = (*f == '0');
982 /* parse the width.precision part */
983 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000984 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000985 width = (width*10) + *f++ - '0';
986 precision = 0;
987 if (*f == '.') {
988 f++;
David Malcolm96960882010-11-05 17:23:41 +0000989 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000990 precision = (precision*10) + *f++ - '0';
991 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000992 /* Handle %ld, %lu, %lld and %llu. */
993 if (*f == 'l') {
994 if (f[1] == 'd' || f[1] == 'u') {
995 longflag = 1;
996 ++f;
997 }
998#ifdef HAVE_LONG_LONG
999 else if (f[1] == 'l' &&
1000 (f[2] == 'd' || f[2] == 'u')) {
1001 longlongflag = 1;
1002 f += 2;
1003 }
1004#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001005 }
1006 /* handle the size_t flag. */
1007 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1008 size_tflag = 1;
1009 ++f;
1010 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001011
Benjamin Peterson14339b62009-01-31 16:36:08 +00001012 switch (*f) {
1013 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +00001014 {
1015 int ordinal = va_arg(vargs, int);
1016#ifndef Py_UNICODE_WIDE
1017 if (ordinal > 0xffff) {
1018 ordinal -= 0x10000;
1019 *s++ = 0xD800 | (ordinal >> 10);
1020 *s++ = 0xDC00 | (ordinal & 0x3FF);
1021 } else
1022#endif
1023 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001024 break;
Victor Stinner659eb842011-02-23 12:14:22 +00001025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001026 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001027 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1028 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001029 if (longflag)
1030 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001031#ifdef HAVE_LONG_LONG
1032 else if (longlongflag)
1033 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1034#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 else if (size_tflag)
1036 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1037 else
1038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001042 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1043 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001044 if (longflag)
1045 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001046#ifdef HAVE_LONG_LONG
1047 else if (longlongflag)
1048 sprintf(realbuffer, fmt, va_arg(vargs,
1049 unsigned PY_LONG_LONG));
1050#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 else if (size_tflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1053 else
1054 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1055 appendstring(realbuffer);
1056 break;
1057 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001058 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001059 sprintf(realbuffer, fmt, va_arg(vargs, int));
1060 appendstring(realbuffer);
1061 break;
1062 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001063 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 sprintf(realbuffer, fmt, va_arg(vargs, int));
1065 appendstring(realbuffer);
1066 break;
1067 case 's':
1068 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001069 /* unused, since we already have the result */
1070 (void) va_arg(vargs, char *);
1071 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1072 PyUnicode_GET_SIZE(*callresult));
1073 s += PyUnicode_GET_SIZE(*callresult);
1074 /* We're done with the unicode()/repr() => forget it */
1075 Py_DECREF(*callresult);
1076 /* switch to next unicode()/repr() result */
1077 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 break;
1079 }
1080 case 'U':
1081 {
1082 PyObject *obj = va_arg(vargs, PyObject *);
1083 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1084 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1085 s += size;
1086 break;
1087 }
1088 case 'V':
1089 {
1090 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2b574a22011-03-01 22:48:49 +00001091 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001092 if (obj) {
1093 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1094 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1095 s += size;
1096 } else {
Victor Stinner2b574a22011-03-01 22:48:49 +00001097 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1098 PyUnicode_GET_SIZE(*callresult));
1099 s += PyUnicode_GET_SIZE(*callresult);
1100 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001101 }
Victor Stinner2b574a22011-03-01 22:48:49 +00001102 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001103 break;
1104 }
1105 case 'S':
1106 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001107 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001108 {
1109 Py_UNICODE *ucopy;
1110 Py_ssize_t usize;
1111 Py_ssize_t upos;
1112 /* unused, since we already have the result */
1113 (void) va_arg(vargs, PyObject *);
1114 ucopy = PyUnicode_AS_UNICODE(*callresult);
1115 usize = PyUnicode_GET_SIZE(*callresult);
1116 for (upos = 0; upos<usize;)
1117 *s++ = ucopy[upos++];
1118 /* We're done with the unicode()/repr() => forget it */
1119 Py_DECREF(*callresult);
1120 /* switch to next unicode()/repr() result */
1121 ++callresult;
1122 break;
1123 }
1124 case 'p':
1125 sprintf(buffer, "%p", va_arg(vargs, void*));
1126 /* %p is ill-defined: ensure leading 0x. */
1127 if (buffer[1] == 'X')
1128 buffer[1] = 'x';
1129 else if (buffer[1] != 'x') {
1130 memmove(buffer+2, buffer, strlen(buffer)+1);
1131 buffer[0] = '0';
1132 buffer[1] = 'x';
1133 }
1134 appendstring(buffer);
1135 break;
1136 case '%':
1137 *s++ = '%';
1138 break;
1139 default:
1140 appendstring(p);
1141 goto end;
1142 }
Victor Stinner1205f272010-09-11 00:54:47 +00001143 }
Victor Stinner1205f272010-09-11 00:54:47 +00001144 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001145 *s++ = *f;
1146 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147
Benjamin Peterson29060642009-01-31 22:14:21 +00001148 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 if (callresults)
1150 PyObject_Free(callresults);
1151 if (abuffer)
1152 PyObject_Free(abuffer);
1153 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1154 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001155 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001156 if (callresults) {
1157 PyObject **callresult2 = callresults;
1158 while (callresult2 < callresult) {
Victor Stinner2b574a22011-03-01 22:48:49 +00001159 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 ++callresult2;
1161 }
1162 PyObject_Free(callresults);
1163 }
1164 if (abuffer)
1165 PyObject_Free(abuffer);
1166 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001167}
1168
1169#undef appendstring
1170
1171PyObject *
1172PyUnicode_FromFormat(const char *format, ...)
1173{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 PyObject* ret;
1175 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001176
1177#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001178 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001179#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001180 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001181#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001182 ret = PyUnicode_FromFormatV(format, vargs);
1183 va_end(vargs);
1184 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001185}
1186
Victor Stinner5593d8a2010-10-02 11:11:27 +00001187/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1188 convert a Unicode object to a wide character string.
1189
1190 - If w is NULL: return the number of wide characters (including the nul
1191 character) required to convert the unicode object. Ignore size argument.
1192
1193 - Otherwise: return the number of wide characters (excluding the nul
1194 character) written into w. Write at most size wide characters (including
1195 the nul character). */
1196static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001197unicode_aswidechar(PyUnicodeObject *unicode,
1198 wchar_t *w,
1199 Py_ssize_t size)
1200{
1201#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001202 Py_ssize_t res;
1203 if (w != NULL) {
1204 res = PyUnicode_GET_SIZE(unicode);
1205 if (size > res)
1206 size = res + 1;
1207 else
1208 res = size;
1209 memcpy(w, unicode->str, size * sizeof(wchar_t));
1210 return res;
1211 }
1212 else
1213 return PyUnicode_GET_SIZE(unicode) + 1;
1214#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1215 register const Py_UNICODE *u;
1216 const Py_UNICODE *uend;
1217 const wchar_t *worig, *wend;
1218 Py_ssize_t nchar;
1219
Victor Stinner137c34c2010-09-29 10:25:54 +00001220 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001221 uend = u + PyUnicode_GET_SIZE(unicode);
1222 if (w != NULL) {
1223 worig = w;
1224 wend = w + size;
1225 while (u != uend && w != wend) {
1226 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1227 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1228 {
1229 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1230 u += 2;
1231 }
1232 else {
1233 *w = *u;
1234 u++;
1235 }
1236 w++;
1237 }
1238 if (w != wend)
1239 *w = L'\0';
1240 return w - worig;
1241 }
1242 else {
1243 nchar = 1; /* nul character at the end */
1244 while (u != uend) {
1245 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1246 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1247 u += 2;
1248 else
1249 u++;
1250 nchar++;
1251 }
1252 }
1253 return nchar;
1254#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1255 register Py_UNICODE *u, *uend, ordinal;
1256 register Py_ssize_t i;
1257 wchar_t *worig, *wend;
1258 Py_ssize_t nchar;
1259
1260 u = PyUnicode_AS_UNICODE(unicode);
1261 uend = u + PyUnicode_GET_SIZE(u);
1262 if (w != NULL) {
1263 worig = w;
1264 wend = w + size;
1265 while (u != uend && w != wend) {
1266 ordinal = *u;
1267 if (ordinal > 0xffff) {
1268 ordinal -= 0x10000;
1269 *w++ = 0xD800 | (ordinal >> 10);
1270 *w++ = 0xDC00 | (ordinal & 0x3FF);
1271 }
1272 else
1273 *w++ = ordinal;
1274 u++;
1275 }
1276 if (w != wend)
1277 *w = 0;
1278 return w - worig;
1279 }
1280 else {
1281 nchar = 1; /* nul character */
1282 while (u != uend) {
1283 if (*u > 0xffff)
1284 nchar += 2;
1285 else
1286 nchar++;
1287 u++;
1288 }
1289 return nchar;
1290 }
1291#else
1292# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001293#endif
1294}
1295
1296Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001297PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001298 wchar_t *w,
1299 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001302 PyErr_BadInternalCall();
1303 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001305 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306}
1307
Victor Stinner137c34c2010-09-29 10:25:54 +00001308wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001309PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001310 Py_ssize_t *size)
1311{
1312 wchar_t* buffer;
1313 Py_ssize_t buflen;
1314
1315 if (unicode == NULL) {
1316 PyErr_BadInternalCall();
1317 return NULL;
1318 }
1319
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001320 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001321 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001322 PyErr_NoMemory();
1323 return NULL;
1324 }
1325
Victor Stinner137c34c2010-09-29 10:25:54 +00001326 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1327 if (buffer == NULL) {
1328 PyErr_NoMemory();
1329 return NULL;
1330 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001331 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001332 if (size != NULL)
1333 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001334 return buffer;
1335}
1336
Guido van Rossumd57fd912000-03-10 22:53:23 +00001337#endif
1338
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001339PyObject *PyUnicode_FromOrdinal(int ordinal)
1340{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001341 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001342
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001343 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001344 PyErr_SetString(PyExc_ValueError,
1345 "chr() arg not in range(0x110000)");
1346 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001347 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001348
1349#ifndef Py_UNICODE_WIDE
1350 if (ordinal > 0xffff) {
1351 ordinal -= 0x10000;
1352 s[0] = 0xD800 | (ordinal >> 10);
1353 s[1] = 0xDC00 | (ordinal & 0x3FF);
1354 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001355 }
1356#endif
1357
Hye-Shik Chang40574832004-04-06 07:24:51 +00001358 s[0] = (Py_UNICODE)ordinal;
1359 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360}
1361
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362PyObject *PyUnicode_FromObject(register PyObject *obj)
1363{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001364 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001366 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001367 Py_INCREF(obj);
1368 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001369 }
1370 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 /* For a Unicode subtype that's not a Unicode object,
1372 return a true Unicode object with the same data. */
1373 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1374 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001375 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001376 PyErr_Format(PyExc_TypeError,
1377 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001378 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001379 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001380}
1381
1382PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001383 const char *encoding,
1384 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001385{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001386 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001387 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001388
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 PyErr_BadInternalCall();
1391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001393
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001394 /* Decoding bytes objects is the most common case and should be fast */
1395 if (PyBytes_Check(obj)) {
1396 if (PyBytes_GET_SIZE(obj) == 0) {
1397 Py_INCREF(unicode_empty);
1398 v = (PyObject *) unicode_empty;
1399 }
1400 else {
1401 v = PyUnicode_Decode(
1402 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1403 encoding, errors);
1404 }
1405 return v;
1406 }
1407
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001408 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001409 PyErr_SetString(PyExc_TypeError,
1410 "decoding str is not supported");
1411 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001412 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1415 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1416 PyErr_Format(PyExc_TypeError,
1417 "coercing to str: need bytes, bytearray "
1418 "or buffer-like object, %.80s found",
1419 Py_TYPE(obj)->tp_name);
1420 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001421 }
Tim Petersced69f82003-09-16 20:30:58 +00001422
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001423 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001424 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001425 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 }
Tim Petersced69f82003-09-16 20:30:58 +00001427 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001428 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001429
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001430 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001431 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432}
1433
/* Write a normalized copy of encoding into lower: upper case letters are
   lowered and '_' becomes '-', so that e.g. "UTF_8" matches "utf-8".
   lower_len is the size of the lower buffer, including room for the
   terminating nul.  Return 0 if encoding does not fit (it is longer than
   lower_len-1; lower is then left unterminated), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating nul */
    char *const limit = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1466
1467PyObject *PyUnicode_Decode(const char *s,
1468 Py_ssize_t size,
1469 const char *encoding,
1470 const char *errors)
1471{
1472 PyObject *buffer = NULL, *unicode;
1473 Py_buffer info;
1474 char lower[11]; /* Enough for any encoding shortcut */
1475
1476 if (encoding == NULL)
1477 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001478
1479 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001480 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1481 if (strcmp(lower, "utf-8") == 0)
1482 return PyUnicode_DecodeUTF8(s, size, errors);
1483 else if ((strcmp(lower, "latin-1") == 0) ||
1484 (strcmp(lower, "iso-8859-1") == 0))
1485 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001486#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001487 else if (strcmp(lower, "mbcs") == 0)
1488 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001489#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001490 else if (strcmp(lower, "ascii") == 0)
1491 return PyUnicode_DecodeASCII(s, size, errors);
1492 else if (strcmp(lower, "utf-16") == 0)
1493 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1494 else if (strcmp(lower, "utf-32") == 0)
1495 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497
1498 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001499 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001500 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001501 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001502 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 if (buffer == NULL)
1504 goto onError;
1505 unicode = PyCodec_Decode(buffer, encoding, errors);
1506 if (unicode == NULL)
1507 goto onError;
1508 if (!PyUnicode_Check(unicode)) {
1509 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001510 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001511 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 Py_DECREF(unicode);
1513 goto onError;
1514 }
1515 Py_DECREF(buffer);
1516 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001517
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 Py_XDECREF(buffer);
1520 return NULL;
1521}
1522
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001523PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1524 const char *encoding,
1525 const char *errors)
1526{
1527 PyObject *v;
1528
1529 if (!PyUnicode_Check(unicode)) {
1530 PyErr_BadArgument();
1531 goto onError;
1532 }
1533
1534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001536
1537 /* Decode via the codec registry */
1538 v = PyCodec_Decode(unicode, encoding, errors);
1539 if (v == NULL)
1540 goto onError;
1541 return v;
1542
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001544 return NULL;
1545}
1546
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001547PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
1550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 if (!PyUnicode_Check(v)) {
1566 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001567 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001568 Py_TYPE(v)->tp_name);
1569 Py_DECREF(v);
1570 goto onError;
1571 }
1572 return v;
1573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575 return NULL;
1576}
1577
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001579 Py_ssize_t size,
1580 const char *encoding,
1581 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582{
1583 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001584
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585 unicode = PyUnicode_FromUnicode(s, size);
1586 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1589 Py_DECREF(unicode);
1590 return v;
1591}
1592
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001593PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1594 const char *encoding,
1595 const char *errors)
1596{
1597 PyObject *v;
1598
1599 if (!PyUnicode_Check(unicode)) {
1600 PyErr_BadArgument();
1601 goto onError;
1602 }
1603
1604 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001606
1607 /* Encode via the codec registry */
1608 v = PyCodec_Encode(unicode, encoding, errors);
1609 if (v == NULL)
1610 goto onError;
1611 return v;
1612
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001614 return NULL;
1615}
1616
Victor Stinnerad158722010-10-27 00:25:46 +00001617PyObject *
1618PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001619{
Victor Stinner313a1202010-06-11 23:56:51 +00001620#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001621 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1622 PyUnicode_GET_SIZE(unicode),
1623 NULL);
1624#elif defined(__APPLE__)
1625 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1626 PyUnicode_GET_SIZE(unicode),
1627 "surrogateescape");
1628#else
1629 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001630 return PyUnicode_AsEncodedString(unicode,
1631 Py_FileSystemDefaultEncoding,
1632 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001633 }
1634 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001635 /* locale encoding with surrogateescape */
1636 wchar_t *wchar;
1637 char *bytes;
1638 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001639 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001640
1641 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1642 if (wchar == NULL)
1643 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001644 bytes = _Py_wchar2char(wchar, &error_pos);
1645 if (bytes == NULL) {
1646 if (error_pos != (size_t)-1) {
1647 char *errmsg = strerror(errno);
1648 PyObject *exc = NULL;
1649 if (errmsg == NULL)
1650 errmsg = "Py_wchar2char() failed";
1651 raise_encode_exception(&exc,
1652 "filesystemencoding",
1653 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1654 error_pos, error_pos+1,
1655 errmsg);
1656 Py_XDECREF(exc);
1657 }
1658 else
1659 PyErr_NoMemory();
1660 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001661 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001662 }
1663 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001664
1665 bytes_obj = PyBytes_FromString(bytes);
1666 PyMem_Free(bytes);
1667 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001668 }
Victor Stinnerad158722010-10-27 00:25:46 +00001669#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001670}
1671
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1673 const char *encoding,
1674 const char *errors)
1675{
1676 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001677 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001678
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 if (!PyUnicode_Check(unicode)) {
1680 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 }
Fred Drakee4315f52000-05-09 19:53:39 +00001683
Tim Petersced69f82003-09-16 20:30:58 +00001684 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001685 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001686
1687 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001688 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1689 if (strcmp(lower, "utf-8") == 0)
1690 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1691 PyUnicode_GET_SIZE(unicode),
1692 errors);
1693 else if ((strcmp(lower, "latin-1") == 0) ||
1694 (strcmp(lower, "iso-8859-1") == 0))
1695 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1696 PyUnicode_GET_SIZE(unicode),
1697 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001698#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001699 else if (strcmp(lower, "mbcs") == 0)
1700 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1701 PyUnicode_GET_SIZE(unicode),
1702 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001703#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001704 else if (strcmp(lower, "ascii") == 0)
1705 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1706 PyUnicode_GET_SIZE(unicode),
1707 errors);
1708 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001709 /* During bootstrap, we may need to find the encodings
1710 package, to load the file system encoding, and require the
1711 file system encoding in order to load the encodings
1712 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001713
Victor Stinner59e62db2010-05-15 13:14:32 +00001714 Break out of this dependency by assuming that the path to
1715 the encodings module is ASCII-only. XXX could try wcstombs
1716 instead, if the file system encoding is the locale's
1717 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001718 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001719 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1720 !PyThreadState_GET()->interp->codecs_initialized)
1721 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1722 PyUnicode_GET_SIZE(unicode),
1723 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724
1725 /* Encode via the codec registry */
1726 v = PyCodec_Encode(unicode, encoding, errors);
1727 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001728 return NULL;
1729
1730 /* The normal path */
1731 if (PyBytes_Check(v))
1732 return v;
1733
1734 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001735 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001736 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001737 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001738
1739 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1740 "encoder %s returned bytearray instead of bytes",
1741 encoding);
1742 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001743 Py_DECREF(v);
1744 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001745 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001746
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001747 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1748 Py_DECREF(v);
1749 return b;
1750 }
1751
1752 PyErr_Format(PyExc_TypeError,
1753 "encoder did not return a bytes object (type=%.400s)",
1754 Py_TYPE(v)->tp_name);
1755 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001756 return NULL;
1757}
1758
1759PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1760 const char *encoding,
1761 const char *errors)
1762{
1763 PyObject *v;
1764
1765 if (!PyUnicode_Check(unicode)) {
1766 PyErr_BadArgument();
1767 goto onError;
1768 }
1769
1770 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001772
1773 /* Encode via the codec registry */
1774 v = PyCodec_Encode(unicode, encoding, errors);
1775 if (v == NULL)
1776 goto onError;
1777 if (!PyUnicode_Check(v)) {
1778 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001779 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001780 Py_TYPE(v)->tp_name);
1781 Py_DECREF(v);
1782 goto onError;
1783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001785
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 return NULL;
1788}
1789
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001790PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001792{
1793 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001794 if (v)
1795 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001796 if (errors != NULL)
1797 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001798 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001799 PyUnicode_GET_SIZE(unicode),
1800 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001801 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001802 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001803 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001804 return v;
1805}
1806
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001807PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001808PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001809 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001810 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1811}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001812
Christian Heimes5894ba72007-11-04 11:43:14 +00001813PyObject*
1814PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1815{
Victor Stinnerad158722010-10-27 00:25:46 +00001816#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1817 return PyUnicode_DecodeMBCS(s, size, NULL);
1818#elif defined(__APPLE__)
1819 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1820#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001821 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1822 can be undefined. If it is case, decode using UTF-8. The following assumes
1823 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1824 bootstrapping process where the codecs aren't ready yet.
1825 */
1826 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001827 return PyUnicode_Decode(s, size,
1828 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001829 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001830 }
1831 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001832 /* locale encoding with surrogateescape */
1833 wchar_t *wchar;
1834 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001835 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001836
1837 if (s[size] != '\0' || size != strlen(s)) {
1838 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1839 return NULL;
1840 }
1841
Victor Stinner168e1172010-10-16 23:16:16 +00001842 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001843 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001844 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001845
Victor Stinner168e1172010-10-16 23:16:16 +00001846 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001847 PyMem_Free(wchar);
1848 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001849 }
Victor Stinnerad158722010-10-27 00:25:46 +00001850#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001851}
1852
Martin v. Löwis011e8422009-05-05 04:43:17 +00001853
1854int
1855PyUnicode_FSConverter(PyObject* arg, void* addr)
1856{
1857 PyObject *output = NULL;
1858 Py_ssize_t size;
1859 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001860 if (arg == NULL) {
1861 Py_DECREF(*(PyObject**)addr);
1862 return 1;
1863 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001864 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001865 output = arg;
1866 Py_INCREF(output);
1867 }
1868 else {
1869 arg = PyUnicode_FromObject(arg);
1870 if (!arg)
1871 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001872 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001873 Py_DECREF(arg);
1874 if (!output)
1875 return 0;
1876 if (!PyBytes_Check(output)) {
1877 Py_DECREF(output);
1878 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1879 return 0;
1880 }
1881 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001882 size = PyBytes_GET_SIZE(output);
1883 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001884 if (size != strlen(data)) {
1885 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1886 Py_DECREF(output);
1887 return 0;
1888 }
1889 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001890 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001891}
1892
1893
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001894int
1895PyUnicode_FSDecoder(PyObject* arg, void* addr)
1896{
1897 PyObject *output = NULL;
1898 Py_ssize_t size;
1899 void *data;
1900 if (arg == NULL) {
1901 Py_DECREF(*(PyObject**)addr);
1902 return 1;
1903 }
1904 if (PyUnicode_Check(arg)) {
1905 output = arg;
1906 Py_INCREF(output);
1907 }
1908 else {
1909 arg = PyBytes_FromObject(arg);
1910 if (!arg)
1911 return 0;
1912 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1913 PyBytes_GET_SIZE(arg));
1914 Py_DECREF(arg);
1915 if (!output)
1916 return 0;
1917 if (!PyUnicode_Check(output)) {
1918 Py_DECREF(output);
1919 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1920 return 0;
1921 }
1922 }
1923 size = PyUnicode_GET_SIZE(output);
1924 data = PyUnicode_AS_UNICODE(output);
1925 if (size != Py_UNICODE_strlen(data)) {
1926 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1927 Py_DECREF(output);
1928 return 0;
1929 }
1930 *(PyObject**)addr = output;
1931 return Py_CLEANUP_SUPPORTED;
1932}
1933
1934
Martin v. Löwis5b222132007-06-10 09:51:05 +00001935char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001936_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001937{
Christian Heimesf3863112007-11-22 07:46:41 +00001938 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001939 if (!PyUnicode_Check(unicode)) {
1940 PyErr_BadArgument();
1941 return NULL;
1942 }
Christian Heimesf3863112007-11-22 07:46:41 +00001943 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1944 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001945 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001946 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001947 *psize = PyBytes_GET_SIZE(bytes);
1948 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001949}
1950
1951char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001952_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001953{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001954 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001955}
1956
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1958{
1959 if (!PyUnicode_Check(unicode)) {
1960 PyErr_BadArgument();
1961 goto onError;
1962 }
1963 return PyUnicode_AS_UNICODE(unicode);
1964
Benjamin Peterson29060642009-01-31 22:14:21 +00001965 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 return NULL;
1967}
1968
Martin v. Löwis18e16552006-02-15 17:27:45 +00001969Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970{
1971 if (!PyUnicode_Check(unicode)) {
1972 PyErr_BadArgument();
1973 goto onError;
1974 }
1975 return PyUnicode_GET_SIZE(unicode);
1976
Benjamin Peterson29060642009-01-31 22:14:21 +00001977 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978 return -1;
1979}
1980
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1985
Victor Stinner554f3f02010-06-16 23:33:54 +00001986/* create or adjust a UnicodeDecodeError */
1987static void
1988make_decode_exception(PyObject **exceptionObject,
1989 const char *encoding,
1990 const char *input, Py_ssize_t length,
1991 Py_ssize_t startpos, Py_ssize_t endpos,
1992 const char *reason)
1993{
1994 if (*exceptionObject == NULL) {
1995 *exceptionObject = PyUnicodeDecodeError_Create(
1996 encoding, input, length, startpos, endpos, reason);
1997 }
1998 else {
1999 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2000 goto onError;
2001 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2002 goto onError;
2003 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2004 goto onError;
2005 }
2006 return;
2007
2008onError:
2009 Py_DECREF(*exceptionObject);
2010 *exceptionObject = NULL;
2011}
2012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013/* error handling callback helper:
2014 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002015 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002016 and adjust various state variables.
2017 return 0 on success, -1 on error
2018*/
2019
2020static
2021int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00002022 const char *encoding, const char *reason,
2023 const char **input, const char **inend, Py_ssize_t *startinpos,
2024 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2025 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002027 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028
2029 PyObject *restuple = NULL;
2030 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002031 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002032 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002033 Py_ssize_t requiredsize;
2034 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002036 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002037 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002038 int res = -1;
2039
2040 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002041 *errorHandler = PyCodec_LookupError(errors);
2042 if (*errorHandler == NULL)
2043 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 }
2045
Victor Stinner554f3f02010-06-16 23:33:54 +00002046 make_decode_exception(exceptionObject,
2047 encoding,
2048 *input, *inend - *input,
2049 *startinpos, *endinpos,
2050 reason);
2051 if (*exceptionObject == NULL)
2052 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053
2054 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2055 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002056 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002058 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 }
2061 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002062 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002063
2064 /* Copy back the bytes variables, which might have been modified by the
2065 callback */
2066 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2067 if (!inputobj)
2068 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002069 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002070 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002071 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002072 *input = PyBytes_AS_STRING(inputobj);
2073 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002074 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002075 /* we can DECREF safely, as the exception has another reference,
2076 so the object won't go away. */
2077 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002080 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002081 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2083 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002084 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085
2086 /* need more space? (at least enough for what we
2087 have+the replacement+the rest of the string (starting
2088 at the new input position), so we won't have to check space
2089 when there are no errors in the rest of the string) */
2090 repptr = PyUnicode_AS_UNICODE(repunicode);
2091 repsize = PyUnicode_GET_SIZE(repunicode);
2092 requiredsize = *outpos + repsize + insize-newpos;
2093 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002094 if (requiredsize<2*outsize)
2095 requiredsize = 2*outsize;
2096 if (_PyUnicode_Resize(output, requiredsize) < 0)
2097 goto onError;
2098 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002099 }
2100 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002101 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002102 Py_UNICODE_COPY(*outptr, repptr, repsize);
2103 *outptr += repsize;
2104 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002106 /* we made it! */
2107 res = 0;
2108
Benjamin Peterson29060642009-01-31 22:14:21 +00002109 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002110 Py_XDECREF(restuple);
2111 return res;
2112}
2113
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002114/* --- UTF-7 Codec -------------------------------------------------------- */
2115
Antoine Pitrou244651a2009-05-04 18:56:13 +00002116/* See RFC2152 for details. We encode conservatively and decode liberally. */
2117
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so arguments must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   (anything that is not a letter, a digit or '+' maps to 63) */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 :                              \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :  \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :  \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
      "abcdefghijklmnopqrstuvwxyz"      \
      "0123456789+/")[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
2148
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup indexed by the ASCII code (0..127).
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is checked to lie in 1..127 before it is used as a table index.
 * All three arguments are parenthesized in the expansion so that flag
 * expressions such as `a || b` or `x ? 1 : 0` keep their meaning; c may
 * be evaluated several times and must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002194
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002195PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002196 Py_ssize_t size,
2197 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002198{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002199 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2200}
2201
Antoine Pitrou244651a2009-05-04 18:56:13 +00002202/* The decoder. The only state we preserve is our read position,
2203 * i.e. how many characters we have consumed. So if we end in the
2204 * middle of a shift sequence we have to back off the read position
2205 * and the output to the beginning of the sequence, otherwise we lose
2206 * all the shift state (seen bits, number of bits seen, high
2207 * surrogate). */
2208
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002209PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002210 Py_ssize_t size,
2211 const char *errors,
2212 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002214 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002215 Py_ssize_t startinpos;
2216 Py_ssize_t endinpos;
2217 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002218 const char *e;
2219 PyUnicodeObject *unicode;
2220 Py_UNICODE *p;
2221 const char *errmsg = "";
2222 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002223 Py_UNICODE *shiftOutStart;
2224 unsigned int base64bits = 0;
2225 unsigned long base64buffer = 0;
2226 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002227 PyObject *errorHandler = NULL;
2228 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002229
2230 unicode = _PyUnicode_New(size);
2231 if (!unicode)
2232 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002233 if (size == 0) {
2234 if (consumed)
2235 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002236 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002237 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002238
2239 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002240 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002241 e = s + size;
2242
2243 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002245 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002246 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002247
Antoine Pitrou244651a2009-05-04 18:56:13 +00002248 if (inShift) { /* in a base-64 section */
2249 if (IS_BASE64(ch)) { /* consume a base-64 character */
2250 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2251 base64bits += 6;
2252 s++;
2253 if (base64bits >= 16) {
2254 /* we have enough bits for a UTF-16 value */
2255 Py_UNICODE outCh = (Py_UNICODE)
2256 (base64buffer >> (base64bits-16));
2257 base64bits -= 16;
2258 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2259 if (surrogate) {
2260 /* expecting a second surrogate */
2261 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2262#ifdef Py_UNICODE_WIDE
2263 *p++ = (((surrogate & 0x3FF)<<10)
2264 | (outCh & 0x3FF)) + 0x10000;
2265#else
2266 *p++ = surrogate;
2267 *p++ = outCh;
2268#endif
2269 surrogate = 0;
2270 }
2271 else {
2272 surrogate = 0;
2273 errmsg = "second surrogate missing";
2274 goto utf7Error;
2275 }
2276 }
2277 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2278 /* first surrogate */
2279 surrogate = outCh;
2280 }
2281 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2282 errmsg = "unexpected second surrogate";
2283 goto utf7Error;
2284 }
2285 else {
2286 *p++ = outCh;
2287 }
2288 }
2289 }
2290 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002291 inShift = 0;
2292 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293 if (surrogate) {
2294 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002295 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002296 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002297 if (base64bits > 0) { /* left-over bits */
2298 if (base64bits >= 6) {
2299 /* We've seen at least one base-64 character */
2300 errmsg = "partial character in shift sequence";
2301 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002302 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002303 else {
2304 /* Some bits remain; they should be zero */
2305 if (base64buffer != 0) {
2306 errmsg = "non-zero padding bits in shift sequence";
2307 goto utf7Error;
2308 }
2309 }
2310 }
2311 if (ch != '-') {
2312 /* '-' is absorbed; other terminating
2313 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002314 *p++ = ch;
2315 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002316 }
2317 }
2318 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002319 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002320 s++; /* consume '+' */
2321 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002322 s++;
2323 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002324 }
2325 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002326 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002327 shiftOutStart = p;
2328 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002329 }
2330 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002331 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002332 *p++ = ch;
2333 s++;
2334 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002335 else {
2336 startinpos = s-starts;
2337 s++;
2338 errmsg = "unexpected special character";
2339 goto utf7Error;
2340 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002341 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002342utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002343 outpos = p-PyUnicode_AS_UNICODE(unicode);
2344 endinpos = s-starts;
2345 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002346 errors, &errorHandler,
2347 "utf7", errmsg,
2348 &starts, &e, &startinpos, &endinpos, &exc, &s,
2349 &unicode, &outpos, &p))
2350 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002351 }
2352
Antoine Pitrou244651a2009-05-04 18:56:13 +00002353 /* end of string */
2354
2355 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2356 /* if we're in an inconsistent state, that's an error */
2357 if (surrogate ||
2358 (base64bits >= 6) ||
2359 (base64bits > 0 && base64buffer != 0)) {
2360 outpos = p-PyUnicode_AS_UNICODE(unicode);
2361 endinpos = size;
2362 if (unicode_decode_call_errorhandler(
2363 errors, &errorHandler,
2364 "utf7", "unterminated shift sequence",
2365 &starts, &e, &startinpos, &endinpos, &exc, &s,
2366 &unicode, &outpos, &p))
2367 goto onError;
2368 if (s < e)
2369 goto restart;
2370 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002371 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002372
2373 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002374 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002375 if (inShift) {
2376 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002377 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002378 }
2379 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002380 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002381 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002382 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002383
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002384 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002385 goto onError;
2386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002387 Py_XDECREF(errorHandler);
2388 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002389 return (PyObject *)unicode;
2390
Benjamin Peterson29060642009-01-31 22:14:21 +00002391 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002392 Py_XDECREF(errorHandler);
2393 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002394 Py_DECREF(unicode);
2395 return NULL;
2396}
2397
2398
2399PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002400 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002401 int base64SetO,
2402 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002403 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002404{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002405 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002406 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002407 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002408 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002409 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002410 unsigned int base64bits = 0;
2411 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002412 char * out;
2413 char * start;
2414
2415 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002416 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002417
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002418 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002419 return PyErr_NoMemory();
2420
Antoine Pitrou244651a2009-05-04 18:56:13 +00002421 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002422 if (v == NULL)
2423 return NULL;
2424
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002425 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002426 for (;i < size; ++i) {
2427 Py_UNICODE ch = s[i];
2428
Antoine Pitrou244651a2009-05-04 18:56:13 +00002429 if (inShift) {
2430 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2431 /* shifting out */
2432 if (base64bits) { /* output remaining bits */
2433 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2434 base64buffer = 0;
2435 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002436 }
2437 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002438 /* Characters not in the BASE64 set implicitly unshift the sequence
2439 so no '-' is required, except if the character is itself a '-' */
2440 if (IS_BASE64(ch) || ch == '-') {
2441 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002442 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002443 *out++ = (char) ch;
2444 }
2445 else {
2446 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002447 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002448 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002449 else { /* not in a shift sequence */
2450 if (ch == '+') {
2451 *out++ = '+';
2452 *out++ = '-';
2453 }
2454 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2455 *out++ = (char) ch;
2456 }
2457 else {
2458 *out++ = '+';
2459 inShift = 1;
2460 goto encode_char;
2461 }
2462 }
2463 continue;
2464encode_char:
2465#ifdef Py_UNICODE_WIDE
2466 if (ch >= 0x10000) {
2467 /* code first surrogate */
2468 base64bits += 16;
2469 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2470 while (base64bits >= 6) {
2471 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2472 base64bits -= 6;
2473 }
2474 /* prepare second surrogate */
2475 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2476 }
2477#endif
2478 base64bits += 16;
2479 base64buffer = (base64buffer << 16) | ch;
2480 while (base64bits >= 6) {
2481 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2482 base64bits -= 6;
2483 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002484 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002485 if (base64bits)
2486 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2487 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002488 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002489 if (_PyBytes_Resize(&v, out - start) < 0)
2490 return NULL;
2491 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002492}
2493
/* The UTF-7 helper macros are not needed past this point; undefine them
   so they do not leak into the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002499
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500/* --- UTF-8 Codec -------------------------------------------------------- */
2501
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length in bytes of the sequence that byte introduces.  An entry
   of 0 marks a byte that may not start a sequence.  See RFC 3629. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 00-0F: ASCII */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 80-8F: continuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* C0-C1 illegal, C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   /* F0-F4, F5-FF illegal */
};
2523
Guido van Rossumd57fd912000-03-10 22:53:23 +00002524PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002525 Py_ssize_t size,
2526 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527{
Walter Dörwald69652032004-09-07 20:24:22 +00002528 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2529}
2530
Antoine Pitrouab868312009-01-10 15:40:25 +00002531/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2532#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2533
2534/* Mask to quickly check whether a C 'long' contains a
2535 non-ASCII, UTF8-encoded char. */
2536#if (SIZEOF_LONG == 8)
2537# define ASCII_CHAR_MASK 0x8080808080808080L
2538#elif (SIZEOF_LONG == 4)
2539# define ASCII_CHAR_MASK 0x80808080L
2540#else
2541# error C 'long' size should be either 4 or 8!
2542#endif
2543
Walter Dörwald69652032004-09-07 20:24:22 +00002544PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002545 Py_ssize_t size,
2546 const char *errors,
2547 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002548{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002551 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002552 Py_ssize_t startinpos;
2553 Py_ssize_t endinpos;
2554 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002555 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 PyUnicodeObject *unicode;
2557 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002558 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002559 PyObject *errorHandler = NULL;
2560 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561
2562 /* Note: size will always be longer than the resulting Unicode
2563 character count */
2564 unicode = _PyUnicode_New(size);
2565 if (!unicode)
2566 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002567 if (size == 0) {
2568 if (consumed)
2569 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572
2573 /* Unpack UTF-8 encoded data */
2574 p = unicode->str;
2575 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002576 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577
2578 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002579 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580
2581 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002582 /* Fast path for runs of ASCII characters. Given that common UTF-8
2583 input will consist of an overwhelming majority of ASCII
2584 characters, we try to optimize for this case by checking
2585 as many characters as a C 'long' can contain.
2586 First, check if we can do an aligned read, as most CPUs have
2587 a penalty for unaligned reads.
2588 */
2589 if (!((size_t) s & LONG_PTR_MASK)) {
2590 /* Help register allocation */
2591 register const char *_s = s;
2592 register Py_UNICODE *_p = p;
2593 while (_s < aligned_end) {
2594 /* Read a whole long at a time (either 4 or 8 bytes),
2595 and do a fast unrolled copy if it only contains ASCII
2596 characters. */
2597 unsigned long data = *(unsigned long *) _s;
2598 if (data & ASCII_CHAR_MASK)
2599 break;
2600 _p[0] = (unsigned char) _s[0];
2601 _p[1] = (unsigned char) _s[1];
2602 _p[2] = (unsigned char) _s[2];
2603 _p[3] = (unsigned char) _s[3];
2604#if (SIZEOF_LONG == 8)
2605 _p[4] = (unsigned char) _s[4];
2606 _p[5] = (unsigned char) _s[5];
2607 _p[6] = (unsigned char) _s[6];
2608 _p[7] = (unsigned char) _s[7];
2609#endif
2610 _s += SIZEOF_LONG;
2611 _p += SIZEOF_LONG;
2612 }
2613 s = _s;
2614 p = _p;
2615 if (s == e)
2616 break;
2617 ch = (unsigned char)*s;
2618 }
2619 }
2620
2621 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002622 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 s++;
2624 continue;
2625 }
2626
2627 n = utf8_code_length[ch];
2628
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002629 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002630 if (consumed)
2631 break;
2632 else {
2633 errmsg = "unexpected end of data";
2634 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002635 endinpos = startinpos+1;
2636 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2637 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002638 goto utf8Error;
2639 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641
2642 switch (n) {
2643
2644 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002645 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002646 startinpos = s-starts;
2647 endinpos = startinpos+1;
2648 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649
2650 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002651 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 startinpos = s-starts;
2653 endinpos = startinpos+1;
2654 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655
2656 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002657 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002658 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002659 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002660 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002661 goto utf8Error;
2662 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002664 assert ((ch > 0x007F) && (ch <= 0x07FF));
2665 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 break;
2667
2668 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002669 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2670 will result in surrogates in range d800-dfff. Surrogates are
2671 not valid UTF-8 so they are rejected.
2672 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2673 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002674 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002675 (s[2] & 0xc0) != 0x80 ||
2676 ((unsigned char)s[0] == 0xE0 &&
2677 (unsigned char)s[1] < 0xA0) ||
2678 ((unsigned char)s[0] == 0xED &&
2679 (unsigned char)s[1] > 0x9F)) {
2680 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002681 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002682 endinpos = startinpos + 1;
2683
2684 /* if s[1] first two bits are 1 and 0, then the invalid
2685 continuation byte is s[2], so increment endinpos by 1,
2686 if not, s[1] is invalid and endinpos doesn't need to
2687 be incremented. */
2688 if ((s[1] & 0xC0) == 0x80)
2689 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002690 goto utf8Error;
2691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002693 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2694 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002695 break;
2696
2697 case 4:
2698 if ((s[1] & 0xc0) != 0x80 ||
2699 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002700 (s[3] & 0xc0) != 0x80 ||
2701 ((unsigned char)s[0] == 0xF0 &&
2702 (unsigned char)s[1] < 0x90) ||
2703 ((unsigned char)s[0] == 0xF4 &&
2704 (unsigned char)s[1] > 0x8F)) {
2705 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002706 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002707 endinpos = startinpos + 1;
2708 if ((s[1] & 0xC0) == 0x80) {
2709 endinpos++;
2710 if ((s[2] & 0xC0) == 0x80)
2711 endinpos++;
2712 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002713 goto utf8Error;
2714 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002715 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002716 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2717 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2718
Fredrik Lundh8f455852001-06-27 18:59:43 +00002719#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002720 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002721#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002722 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002723
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002724 /* translate from 10000..10FFFF to 0..FFFF */
2725 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002726
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002727 /* high surrogate = top 10 bits added to D800 */
2728 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002729
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002730 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002731 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002732#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 }
2735 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002736 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002737
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 utf8Error:
2739 outpos = p-PyUnicode_AS_UNICODE(unicode);
2740 if (unicode_decode_call_errorhandler(
2741 errors, &errorHandler,
2742 "utf8", errmsg,
2743 &starts, &e, &startinpos, &endinpos, &exc, &s,
2744 &unicode, &outpos, &p))
2745 goto onError;
2746 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 }
Walter Dörwald69652032004-09-07 20:24:22 +00002748 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750
2751 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002752 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 goto onError;
2754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002755 Py_XDECREF(errorHandler);
2756 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 return (PyObject *)unicode;
2758
Benjamin Peterson29060642009-01-31 22:14:21 +00002759 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 Py_XDECREF(errorHandler);
2761 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 Py_DECREF(unicode);
2763 return NULL;
2764}
2765
/* Drop the ASCII fast-path mask now that the UTF-8 decoder is defined. */
#undef ASCII_CHAR_MASK
2767
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes starting at `s` into a freshly PyMem_Malloc'ed,
   NUL-terminated wchar_t buffer and returns it.  Every byte that cannot be
   decoded is emitted as the single code unit 0xDC00 + byte and decoding
   resumes at the following byte.

   Returns NULL on failure.  NOTE(review): an exception is set only on the
   size-overflow path; a failing PyMem_Malloc() returns NULL without
   setting one here.  The caller presumably releases the buffer with
   PyMem_Free() -- confirm against the call site. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: a sequence never decodes to more code units than it has
       bytes, so size + 1 elements (data plus terminator) are enough */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* truncated sequence at the end of the input */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the output buffer holds wchar_t, so cast to wchar_t like
               every other store in this function (not Py_UNICODE) */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the raw lead byte here: every jump to this label
           happens before ch is overwritten with a decoded value */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002882
Tim Peters602f7402002-04-27 18:03:26 +00002883/* Allocation strategy: if the string is short, convert into a stack buffer
2884 and allocate exactly as much space needed at the end. Else allocate the
2885 maximum possible needed (4 result bytes per Unicode character), and return
2886 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002887*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002888PyObject *
2889PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002890 Py_ssize_t size,
2891 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892{
Tim Peters602f7402002-04-27 18:03:26 +00002893#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002894
Guido van Rossum98297ee2007-11-06 21:34:58 +00002895 Py_ssize_t i; /* index into s of next input byte */
2896 PyObject *result; /* result string object */
2897 char *p; /* next free byte in output buffer */
2898 Py_ssize_t nallocated; /* number of result bytes allocated */
2899 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002900 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002901 PyObject *errorHandler = NULL;
2902 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002903
Tim Peters602f7402002-04-27 18:03:26 +00002904 assert(s != NULL);
2905 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906
Tim Peters602f7402002-04-27 18:03:26 +00002907 if (size <= MAX_SHORT_UNICHARS) {
2908 /* Write into the stack buffer; nallocated can't overflow.
2909 * At the end, we'll allocate exactly as much heap space as it
2910 * turns out we need.
2911 */
2912 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002913 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002914 p = stackbuf;
2915 }
2916 else {
2917 /* Overallocate on the heap, and give the excess back at the end. */
2918 nallocated = size * 4;
2919 if (nallocated / 4 != size) /* overflow! */
2920 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002921 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002922 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002923 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002924 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002925 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002926
Tim Peters602f7402002-04-27 18:03:26 +00002927 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002928 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002929
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002930 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002931 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002933
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002935 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002936 *p++ = (char)(0xc0 | (ch >> 6));
2937 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002938 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002939#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002940 /* Special case: check for high and low surrogate */
2941 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2942 Py_UCS4 ch2 = s[i];
2943 /* Combine the two surrogates to form a UCS4 value */
2944 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2945 i++;
2946
2947 /* Encode UCS4 Unicode ordinals */
2948 *p++ = (char)(0xf0 | (ch >> 18));
2949 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002950 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2951 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002952 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002953#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002954 Py_ssize_t newpos;
2955 PyObject *rep;
2956 Py_ssize_t repsize, k;
2957 rep = unicode_encode_call_errorhandler
2958 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2959 s, size, &exc, i-1, i, &newpos);
2960 if (!rep)
2961 goto error;
2962
2963 if (PyBytes_Check(rep))
2964 repsize = PyBytes_GET_SIZE(rep);
2965 else
2966 repsize = PyUnicode_GET_SIZE(rep);
2967
2968 if (repsize > 4) {
2969 Py_ssize_t offset;
2970
2971 if (result == NULL)
2972 offset = p - stackbuf;
2973 else
2974 offset = p - PyBytes_AS_STRING(result);
2975
2976 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2977 /* integer overflow */
2978 PyErr_NoMemory();
2979 goto error;
2980 }
2981 nallocated += repsize - 4;
2982 if (result != NULL) {
2983 if (_PyBytes_Resize(&result, nallocated) < 0)
2984 goto error;
2985 } else {
2986 result = PyBytes_FromStringAndSize(NULL, nallocated);
2987 if (result == NULL)
2988 goto error;
2989 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2990 }
2991 p = PyBytes_AS_STRING(result) + offset;
2992 }
2993
2994 if (PyBytes_Check(rep)) {
2995 char *prep = PyBytes_AS_STRING(rep);
2996 for(k = repsize; k > 0; k--)
2997 *p++ = *prep++;
2998 } else /* rep is unicode */ {
2999 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3000 Py_UNICODE c;
3001
3002 for(k=0; k<repsize; k++) {
3003 c = prep[k];
3004 if (0x80 <= c) {
3005 raise_encode_exception(&exc, "utf-8", s, size,
3006 i-1, i, "surrogates not allowed");
3007 goto error;
3008 }
3009 *p++ = (char)prep[k];
3010 }
3011 }
3012 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003013#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003014 }
Victor Stinner445a6232010-04-22 20:01:57 +00003015#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003016 } else if (ch < 0x10000) {
3017 *p++ = (char)(0xe0 | (ch >> 12));
3018 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3019 *p++ = (char)(0x80 | (ch & 0x3f));
3020 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003021 /* Encode UCS4 Unicode ordinals */
3022 *p++ = (char)(0xf0 | (ch >> 18));
3023 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3024 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3025 *p++ = (char)(0x80 | (ch & 0x3f));
3026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003028
Guido van Rossum98297ee2007-11-06 21:34:58 +00003029 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003030 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003031 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003032 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003033 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003034 }
3035 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003036 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003037 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003038 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003039 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003040 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003041 Py_XDECREF(errorHandler);
3042 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003043 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003044 error:
3045 Py_XDECREF(errorHandler);
3046 Py_XDECREF(exc);
3047 Py_XDECREF(result);
3048 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003049
Tim Peters602f7402002-04-27 18:03:26 +00003050#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051}
3052
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 if (!PyUnicode_Check(unicode)) {
3056 PyErr_BadArgument();
3057 return NULL;
3058 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003059 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 PyUnicode_GET_SIZE(unicode),
3061 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062}
3063
Walter Dörwald41980ca2007-08-16 21:55:45 +00003064/* --- UTF-32 Codec ------------------------------------------------------- */
3065
3066PyObject *
3067PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003068 Py_ssize_t size,
3069 const char *errors,
3070 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003071{
3072 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3073}
3074
3075PyObject *
3076PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 Py_ssize_t size,
3078 const char *errors,
3079 int *byteorder,
3080 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003081{
3082 const char *starts = s;
3083 Py_ssize_t startinpos;
3084 Py_ssize_t endinpos;
3085 Py_ssize_t outpos;
3086 PyUnicodeObject *unicode;
3087 Py_UNICODE *p;
3088#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003089 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003090 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003091#else
3092 const int pairs = 0;
3093#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003094 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003095 int bo = 0; /* assume native ordering by default */
3096 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003097 /* Offsets from q for retrieving bytes in the right order. */
3098#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3099 int iorder[] = {0, 1, 2, 3};
3100#else
3101 int iorder[] = {3, 2, 1, 0};
3102#endif
3103 PyObject *errorHandler = NULL;
3104 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003105
Walter Dörwald41980ca2007-08-16 21:55:45 +00003106 q = (unsigned char *)s;
3107 e = q + size;
3108
3109 if (byteorder)
3110 bo = *byteorder;
3111
3112 /* Check for BOM marks (U+FEFF) in the input and adjust current
3113 byte order setting accordingly. In native mode, the leading BOM
3114 mark is skipped, in all other modes, it is copied to the output
3115 stream as-is (giving a ZWNBSP character). */
3116 if (bo == 0) {
3117 if (size >= 4) {
3118 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003119 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003120#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003121 if (bom == 0x0000FEFF) {
3122 q += 4;
3123 bo = -1;
3124 }
3125 else if (bom == 0xFFFE0000) {
3126 q += 4;
3127 bo = 1;
3128 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003129#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003130 if (bom == 0x0000FEFF) {
3131 q += 4;
3132 bo = 1;
3133 }
3134 else if (bom == 0xFFFE0000) {
3135 q += 4;
3136 bo = -1;
3137 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003138#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003139 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003140 }
3141
3142 if (bo == -1) {
3143 /* force LE */
3144 iorder[0] = 0;
3145 iorder[1] = 1;
3146 iorder[2] = 2;
3147 iorder[3] = 3;
3148 }
3149 else if (bo == 1) {
3150 /* force BE */
3151 iorder[0] = 3;
3152 iorder[1] = 2;
3153 iorder[2] = 1;
3154 iorder[3] = 0;
3155 }
3156
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003157 /* On narrow builds we split characters outside the BMP into two
3158 codepoints => count how much extra space we need. */
3159#ifndef Py_UNICODE_WIDE
3160 for (qq = q; qq < e; qq += 4)
3161 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3162 pairs++;
3163#endif
3164
3165 /* This might be one to much, because of a BOM */
3166 unicode = _PyUnicode_New((size+3)/4+pairs);
3167 if (!unicode)
3168 return NULL;
3169 if (size == 0)
3170 return (PyObject *)unicode;
3171
3172 /* Unpack UTF-32 encoded data */
3173 p = unicode->str;
3174
Walter Dörwald41980ca2007-08-16 21:55:45 +00003175 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 Py_UCS4 ch;
3177 /* remaining bytes at the end? (size should be divisible by 4) */
3178 if (e-q<4) {
3179 if (consumed)
3180 break;
3181 errmsg = "truncated data";
3182 startinpos = ((const char *)q)-starts;
3183 endinpos = ((const char *)e)-starts;
3184 goto utf32Error;
3185 /* The remaining input chars are ignored if the callback
3186 chooses to skip the input */
3187 }
3188 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3189 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003190
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 if (ch >= 0x110000)
3192 {
3193 errmsg = "codepoint not in range(0x110000)";
3194 startinpos = ((const char *)q)-starts;
3195 endinpos = startinpos+4;
3196 goto utf32Error;
3197 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003198#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003199 if (ch >= 0x10000)
3200 {
3201 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3202 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3203 }
3204 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003205#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 *p++ = ch;
3207 q += 4;
3208 continue;
3209 utf32Error:
3210 outpos = p-PyUnicode_AS_UNICODE(unicode);
3211 if (unicode_decode_call_errorhandler(
3212 errors, &errorHandler,
3213 "utf32", errmsg,
3214 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3215 &unicode, &outpos, &p))
3216 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003217 }
3218
3219 if (byteorder)
3220 *byteorder = bo;
3221
3222 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003224
3225 /* Adjust length */
3226 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3227 goto onError;
3228
3229 Py_XDECREF(errorHandler);
3230 Py_XDECREF(exc);
3231 return (PyObject *)unicode;
3232
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003234 Py_DECREF(unicode);
3235 Py_XDECREF(errorHandler);
3236 Py_XDECREF(exc);
3237 return NULL;
3238}
3239
3240PyObject *
3241PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 Py_ssize_t size,
3243 const char *errors,
3244 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003245{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003246 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003247 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003248 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003249#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003250 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003251#else
3252 const int pairs = 0;
3253#endif
3254 /* Offsets from p for storing byte pairs in the right order. */
3255#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3256 int iorder[] = {0, 1, 2, 3};
3257#else
3258 int iorder[] = {3, 2, 1, 0};
3259#endif
3260
Benjamin Peterson29060642009-01-31 22:14:21 +00003261#define STORECHAR(CH) \
3262 do { \
3263 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3264 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3265 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3266 p[iorder[0]] = (CH) & 0xff; \
3267 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003268 } while(0)
3269
3270 /* In narrow builds we can output surrogate pairs as one codepoint,
3271 so we need less space. */
3272#ifndef Py_UNICODE_WIDE
3273 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3275 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3276 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003277#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003278 nsize = (size - pairs + (byteorder == 0));
3279 bytesize = nsize * 4;
3280 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003282 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003283 if (v == NULL)
3284 return NULL;
3285
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003286 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003287 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003288 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003289 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003290 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003291
3292 if (byteorder == -1) {
3293 /* force LE */
3294 iorder[0] = 0;
3295 iorder[1] = 1;
3296 iorder[2] = 2;
3297 iorder[3] = 3;
3298 }
3299 else if (byteorder == 1) {
3300 /* force BE */
3301 iorder[0] = 3;
3302 iorder[1] = 2;
3303 iorder[2] = 1;
3304 iorder[3] = 0;
3305 }
3306
3307 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003308 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003309#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3311 Py_UCS4 ch2 = *s;
3312 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3313 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3314 s++;
3315 size--;
3316 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003317 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003318#endif
3319 STORECHAR(ch);
3320 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003321
3322 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003323 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003324#undef STORECHAR
3325}
3326
3327PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3328{
3329 if (!PyUnicode_Check(unicode)) {
3330 PyErr_BadArgument();
3331 return NULL;
3332 }
3333 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003334 PyUnicode_GET_SIZE(unicode),
3335 NULL,
3336 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003337}
3338
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339/* --- UTF-16 Codec ------------------------------------------------------- */
3340
Tim Peters772747b2001-08-09 22:21:55 +00003341PyObject *
3342PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003343 Py_ssize_t size,
3344 const char *errors,
3345 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346{
Walter Dörwald69652032004-09-07 20:24:22 +00003347 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3348}
3349
Antoine Pitrouab868312009-01-10 15:40:25 +00003350/* Two masks for fast checking of whether a C 'long' may contain
3351 UTF16-encoded surrogate characters. This is an efficient heuristic,
3352 assuming that non-surrogate characters with a code point >= 0x8000 are
3353 rare in most input.
3354 FAST_CHAR_MASK is used when the input is in native byte ordering,
3355 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003356*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003357#if (SIZEOF_LONG == 8)
3358# define FAST_CHAR_MASK 0x8000800080008000L
3359# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3360#elif (SIZEOF_LONG == 4)
3361# define FAST_CHAR_MASK 0x80008000L
3362# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3363#else
3364# error C 'long' size should be either 4 or 8!
3365#endif
3366
Walter Dörwald69652032004-09-07 20:24:22 +00003367PyObject *
3368PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 Py_ssize_t size,
3370 const char *errors,
3371 int *byteorder,
3372 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003373{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003375 Py_ssize_t startinpos;
3376 Py_ssize_t endinpos;
3377 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 PyUnicodeObject *unicode;
3379 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003380 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003381 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003382 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003383 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003384 /* Offsets from q for retrieving byte pairs in the right order. */
3385#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3386 int ihi = 1, ilo = 0;
3387#else
3388 int ihi = 0, ilo = 1;
3389#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 PyObject *errorHandler = NULL;
3391 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392
3393 /* Note: size will always be longer than the resulting Unicode
3394 character count */
3395 unicode = _PyUnicode_New(size);
3396 if (!unicode)
3397 return NULL;
3398 if (size == 0)
3399 return (PyObject *)unicode;
3400
3401 /* Unpack UTF-16 encoded data */
3402 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003403 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003404 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405
3406 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003407 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003409 /* Check for BOM marks (U+FEFF) in the input and adjust current
3410 byte order setting accordingly. In native mode, the leading BOM
3411 mark is skipped, in all other modes, it is copied to the output
3412 stream as-is (giving a ZWNBSP character). */
3413 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003414 if (size >= 2) {
3415 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003416#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003417 if (bom == 0xFEFF) {
3418 q += 2;
3419 bo = -1;
3420 }
3421 else if (bom == 0xFFFE) {
3422 q += 2;
3423 bo = 1;
3424 }
Tim Petersced69f82003-09-16 20:30:58 +00003425#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003426 if (bom == 0xFEFF) {
3427 q += 2;
3428 bo = 1;
3429 }
3430 else if (bom == 0xFFFE) {
3431 q += 2;
3432 bo = -1;
3433 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003434#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437
Tim Peters772747b2001-08-09 22:21:55 +00003438 if (bo == -1) {
3439 /* force LE */
3440 ihi = 1;
3441 ilo = 0;
3442 }
3443 else if (bo == 1) {
3444 /* force BE */
3445 ihi = 0;
3446 ilo = 1;
3447 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003448#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3449 native_ordering = ilo < ihi;
3450#else
3451 native_ordering = ilo > ihi;
3452#endif
Tim Peters772747b2001-08-09 22:21:55 +00003453
Antoine Pitrouab868312009-01-10 15:40:25 +00003454 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003455 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003456 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003457 /* First check for possible aligned read of a C 'long'. Unaligned
3458 reads are more expensive, better to defer to another iteration. */
3459 if (!((size_t) q & LONG_PTR_MASK)) {
3460 /* Fast path for runs of non-surrogate chars. */
3461 register const unsigned char *_q = q;
3462 Py_UNICODE *_p = p;
3463 if (native_ordering) {
3464 /* Native ordering is simple: as long as the input cannot
3465 possibly contain a surrogate char, do an unrolled copy
3466 of several 16-bit code points to the target object.
3467 The non-surrogate check is done on several input bytes
3468 at a time (as many as a C 'long' can contain). */
3469 while (_q < aligned_end) {
3470 unsigned long data = * (unsigned long *) _q;
3471 if (data & FAST_CHAR_MASK)
3472 break;
3473 _p[0] = ((unsigned short *) _q)[0];
3474 _p[1] = ((unsigned short *) _q)[1];
3475#if (SIZEOF_LONG == 8)
3476 _p[2] = ((unsigned short *) _q)[2];
3477 _p[3] = ((unsigned short *) _q)[3];
3478#endif
3479 _q += SIZEOF_LONG;
3480 _p += SIZEOF_LONG / 2;
3481 }
3482 }
3483 else {
3484 /* Byteswapped ordering is similar, but we must decompose
3485 the copy bytewise, and take care of zero'ing out the
3486 upper bytes if the target object is in 32-bit units
3487 (that is, in UCS-4 builds). */
3488 while (_q < aligned_end) {
3489 unsigned long data = * (unsigned long *) _q;
3490 if (data & SWAPPED_FAST_CHAR_MASK)
3491 break;
3492 /* Zero upper bytes in UCS-4 builds */
3493#if (Py_UNICODE_SIZE > 2)
3494 _p[0] = 0;
3495 _p[1] = 0;
3496#if (SIZEOF_LONG == 8)
3497 _p[2] = 0;
3498 _p[3] = 0;
3499#endif
3500#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003501 /* Issue #4916; UCS-4 builds on big endian machines must
3502 fill the two last bytes of each 4-byte unit. */
3503#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3504# define OFF 2
3505#else
3506# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003507#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003508 ((unsigned char *) _p)[OFF + 1] = _q[0];
3509 ((unsigned char *) _p)[OFF + 0] = _q[1];
3510 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3511 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3512#if (SIZEOF_LONG == 8)
3513 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3514 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3515 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3516 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3517#endif
3518#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003519 _q += SIZEOF_LONG;
3520 _p += SIZEOF_LONG / 2;
3521 }
3522 }
3523 p = _p;
3524 q = _q;
3525 if (q >= e)
3526 break;
3527 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003528 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529
Benjamin Peterson14339b62009-01-31 16:36:08 +00003530 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003531
3532 if (ch < 0xD800 || ch > 0xDFFF) {
3533 *p++ = ch;
3534 continue;
3535 }
3536
3537 /* UTF-16 code pair: */
3538 if (q > e) {
3539 errmsg = "unexpected end of data";
3540 startinpos = (((const char *)q) - 2) - starts;
3541 endinpos = ((const char *)e) + 1 - starts;
3542 goto utf16Error;
3543 }
3544 if (0xD800 <= ch && ch <= 0xDBFF) {
3545 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3546 q += 2;
3547 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003548#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003549 *p++ = ch;
3550 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003551#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003552 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003553#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 continue;
3555 }
3556 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003557 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003558 startinpos = (((const char *)q)-4)-starts;
3559 endinpos = startinpos+2;
3560 goto utf16Error;
3561 }
3562
Benjamin Peterson14339b62009-01-31 16:36:08 +00003563 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003564 errmsg = "illegal encoding";
3565 startinpos = (((const char *)q)-2)-starts;
3566 endinpos = startinpos+2;
3567 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003568
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 utf16Error:
3570 outpos = p - PyUnicode_AS_UNICODE(unicode);
3571 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003572 errors,
3573 &errorHandler,
3574 "utf16", errmsg,
3575 &starts,
3576 (const char **)&e,
3577 &startinpos,
3578 &endinpos,
3579 &exc,
3580 (const char **)&q,
3581 &unicode,
3582 &outpos,
3583 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003584 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003586 /* remaining byte at the end? (size should be even) */
3587 if (e == q) {
3588 if (!consumed) {
3589 errmsg = "truncated data";
3590 startinpos = ((const char *)q) - starts;
3591 endinpos = ((const char *)e) + 1 - starts;
3592 outpos = p - PyUnicode_AS_UNICODE(unicode);
3593 if (unicode_decode_call_errorhandler(
3594 errors,
3595 &errorHandler,
3596 "utf16", errmsg,
3597 &starts,
3598 (const char **)&e,
3599 &startinpos,
3600 &endinpos,
3601 &exc,
3602 (const char **)&q,
3603 &unicode,
3604 &outpos,
3605 &p))
3606 goto onError;
3607 /* The remaining input chars are ignored if the callback
3608 chooses to skip the input */
3609 }
3610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611
3612 if (byteorder)
3613 *byteorder = bo;
3614
Walter Dörwald69652032004-09-07 20:24:22 +00003615 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003617
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003619 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 goto onError;
3621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 Py_XDECREF(errorHandler);
3623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 return (PyObject *)unicode;
3625
Benjamin Peterson29060642009-01-31 22:14:21 +00003626 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 Py_XDECREF(errorHandler);
3629 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 return NULL;
3631}
3632
Antoine Pitrouab868312009-01-10 15:40:25 +00003633#undef FAST_CHAR_MASK
3634#undef SWAPPED_FAST_CHAR_MASK
3635
Tim Peters772747b2001-08-09 22:21:55 +00003636PyObject *
3637PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 Py_ssize_t size,
3639 const char *errors,
3640 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003642 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003643 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003644 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003645#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003646 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003647#else
3648 const int pairs = 0;
3649#endif
Tim Peters772747b2001-08-09 22:21:55 +00003650 /* Offsets from p for storing byte pairs in the right order. */
3651#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3652 int ihi = 1, ilo = 0;
3653#else
3654 int ihi = 0, ilo = 1;
3655#endif
3656
Benjamin Peterson29060642009-01-31 22:14:21 +00003657#define STORECHAR(CH) \
3658 do { \
3659 p[ihi] = ((CH) >> 8) & 0xff; \
3660 p[ilo] = (CH) & 0xff; \
3661 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003662 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003664#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003665 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 if (s[i] >= 0x10000)
3667 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003668#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003669 /* 2 * (size + pairs + (byteorder == 0)) */
3670 if (size > PY_SSIZE_T_MAX ||
3671 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003673 nsize = size + pairs + (byteorder == 0);
3674 bytesize = nsize * 2;
3675 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003677 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 if (v == NULL)
3679 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003681 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003684 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003685 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003686
3687 if (byteorder == -1) {
3688 /* force LE */
3689 ihi = 1;
3690 ilo = 0;
3691 }
3692 else if (byteorder == 1) {
3693 /* force BE */
3694 ihi = 0;
3695 ilo = 1;
3696 }
3697
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003698 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003699 Py_UNICODE ch = *s++;
3700 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003701#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 if (ch >= 0x10000) {
3703 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3704 ch = 0xD800 | ((ch-0x10000) >> 10);
3705 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003706#endif
Tim Peters772747b2001-08-09 22:21:55 +00003707 STORECHAR(ch);
3708 if (ch2)
3709 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003710 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003711
3712 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003713 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003714#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715}
3716
3717PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3718{
3719 if (!PyUnicode_Check(unicode)) {
3720 PyErr_BadArgument();
3721 return NULL;
3722 }
3723 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003724 PyUnicode_GET_SIZE(unicode),
3725 NULL,
3726 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727}
3728
3729/* --- Unicode Escape Codec ----------------------------------------------- */
3730
Fredrik Lundh06d12682001-01-24 07:59:11 +00003731static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003732
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003734 Py_ssize_t size,
3735 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003738 Py_ssize_t startinpos;
3739 Py_ssize_t endinpos;
3740 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003745 char* message;
3746 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 PyObject *errorHandler = NULL;
3748 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003749
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 /* Escaped strings will always be longer than the resulting
3751 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 length after conversion to the true value.
3753 (but if the error callback returns a long replacement string
3754 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 v = _PyUnicode_New(size);
3756 if (v == NULL)
3757 goto onError;
3758 if (size == 0)
3759 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003763
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 while (s < end) {
3765 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003766 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768
3769 /* Non-escape characters are interpreted as Unicode ordinals */
3770 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003771 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 continue;
3773 }
3774
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 /* \ - Escapes */
3777 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003778 c = *s++;
3779 if (s > end)
3780 c = '\0'; /* Invalid after \ */
3781 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782
Benjamin Peterson29060642009-01-31 22:14:21 +00003783 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 case '\n': break;
3785 case '\\': *p++ = '\\'; break;
3786 case '\'': *p++ = '\''; break;
3787 case '\"': *p++ = '\"'; break;
3788 case 'b': *p++ = '\b'; break;
3789 case 'f': *p++ = '\014'; break; /* FF */
3790 case 't': *p++ = '\t'; break;
3791 case 'n': *p++ = '\n'; break;
3792 case 'r': *p++ = '\r'; break;
3793 case 'v': *p++ = '\013'; break; /* VT */
3794 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3795
Benjamin Peterson29060642009-01-31 22:14:21 +00003796 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 case '0': case '1': case '2': case '3':
3798 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003799 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003800 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003801 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003802 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003803 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003805 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 break;
3807
Benjamin Peterson29060642009-01-31 22:14:21 +00003808 /* hex escapes */
3809 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003811 digits = 2;
3812 message = "truncated \\xXX escape";
3813 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814
Benjamin Peterson29060642009-01-31 22:14:21 +00003815 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003817 digits = 4;
3818 message = "truncated \\uXXXX escape";
3819 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820
Benjamin Peterson29060642009-01-31 22:14:21 +00003821 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003822 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003823 digits = 8;
3824 message = "truncated \\UXXXXXXXX escape";
3825 hexescape:
3826 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003827 outpos = p-PyUnicode_AS_UNICODE(v);
3828 if (s+digits>end) {
3829 endinpos = size;
3830 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003831 errors, &errorHandler,
3832 "unicodeescape", "end of string in escape sequence",
3833 &starts, &end, &startinpos, &endinpos, &exc, &s,
3834 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 goto onError;
3836 goto nextByte;
3837 }
3838 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003839 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003840 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003841 endinpos = (s+i+1)-starts;
3842 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 errors, &errorHandler,
3844 "unicodeescape", message,
3845 &starts, &end, &startinpos, &endinpos, &exc, &s,
3846 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003847 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003848 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003849 }
3850 chr = (chr<<4) & ~0xF;
3851 if (c >= '0' && c <= '9')
3852 chr += c - '0';
3853 else if (c >= 'a' && c <= 'f')
3854 chr += 10 + c - 'a';
3855 else
3856 chr += 10 + c - 'A';
3857 }
3858 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003859 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 /* _decoding_error will have already written into the
3861 target buffer. */
3862 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003863 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003864 /* when we get here, chr is a 32-bit unicode character */
3865 if (chr <= 0xffff)
3866 /* UCS-2 character */
3867 *p++ = (Py_UNICODE) chr;
3868 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003869 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003870 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003871#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003872 *p++ = chr;
3873#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003874 chr -= 0x10000L;
3875 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003876 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003877#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003878 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 endinpos = s-starts;
3880 outpos = p-PyUnicode_AS_UNICODE(v);
3881 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003882 errors, &errorHandler,
3883 "unicodeescape", "illegal Unicode character",
3884 &starts, &end, &startinpos, &endinpos, &exc, &s,
3885 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003886 goto onError;
3887 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003888 break;
3889
Benjamin Peterson29060642009-01-31 22:14:21 +00003890 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003891 case 'N':
3892 message = "malformed \\N character escape";
3893 if (ucnhash_CAPI == NULL) {
3894 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003895 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003896 if (ucnhash_CAPI == NULL)
3897 goto ucnhashError;
3898 }
3899 if (*s == '{') {
3900 const char *start = s+1;
3901 /* look for the closing brace */
3902 while (*s != '}' && s < end)
3903 s++;
3904 if (s > start && s < end && *s == '}') {
3905 /* found a name. look it up in the unicode database */
3906 message = "unknown Unicode character name";
3907 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003908 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003909 goto store;
3910 }
3911 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 endinpos = s-starts;
3913 outpos = p-PyUnicode_AS_UNICODE(v);
3914 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003915 errors, &errorHandler,
3916 "unicodeescape", message,
3917 &starts, &end, &startinpos, &endinpos, &exc, &s,
3918 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003919 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003920 break;
3921
3922 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003923 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 message = "\\ at end of string";
3925 s--;
3926 endinpos = s-starts;
3927 outpos = p-PyUnicode_AS_UNICODE(v);
3928 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 errors, &errorHandler,
3930 "unicodeescape", message,
3931 &starts, &end, &startinpos, &endinpos, &exc, &s,
3932 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003933 goto onError;
3934 }
3935 else {
3936 *p++ = '\\';
3937 *p++ = (unsigned char)s[-1];
3938 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003939 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003941 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003942 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003944 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003946 Py_XDECREF(errorHandler);
3947 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003949
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003951 PyErr_SetString(
3952 PyExc_UnicodeError,
3953 "\\N escapes not supported (can't load unicodedata module)"
3954 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003955 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 Py_XDECREF(errorHandler);
3957 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003958 return NULL;
3959
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 Py_XDECREF(errorHandler);
3963 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 return NULL;
3965}
3966
3967/* Return a Unicode-Escape string version of the Unicode object.
3968
3969 If quotes is true, the string is enclosed in u"" or u'' quotes as
3970 appropriate.
3971
3972*/
3973
Thomas Wouters477c8d52006-05-27 19:21:47 +00003974Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 Py_ssize_t size,
3976 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003977{
3978 /* like wcschr, but doesn't stop at NULL characters */
3979
3980 while (size-- > 0) {
3981 if (*s == ch)
3982 return s;
3983 s++;
3984 }
3985
3986 return NULL;
3987}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003988
/* Lowercase hexadecimal digit characters, indexed by nibble value (0-15).
   The escape encoders below index it with (value >> shift) & 0xf to emit
   the digits of \x, \u and \U sequences. */
Walter Dörwald79e913e2007-05-12 11:08:06 +00003989static const char *hexdigits = "0123456789abcdef";
3990
3991PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003994 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003997#ifdef Py_UNICODE_WIDE
3998 const Py_ssize_t expandsize = 10;
3999#else
4000 const Py_ssize_t expandsize = 6;
4001#endif
4002
Thomas Wouters89f507f2006-12-13 04:49:30 +00004003 /* XXX(nnorwitz): rather than over-allocating, it would be
4004 better to choose a different scheme. Perhaps scan the
4005 first N-chars of the string and allocate based on that size.
4006 */
4007 /* Initial allocation is based on the longest-possible unichr
4008 escape.
4009
4010 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4011 unichr, so in this case it's the longest unichr escape. In
4012 narrow (UTF-16) builds this is five chars per source unichr
4013 since there are two unichrs in the surrogate pair, so in narrow
4014 (UTF-16) builds it's not the longest unichr escape.
4015
4016 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4017 so in the narrow (UTF-16) build case it's the longest unichr
4018 escape.
4019 */
4020
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004021 if (size == 0)
4022 return PyBytes_FromStringAndSize(NULL, 0);
4023
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004024 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004025 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004026
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004027 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 2
4029 + expandsize*size
4030 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 if (repr == NULL)
4032 return NULL;
4033
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004034 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 while (size-- > 0) {
4037 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004038
Walter Dörwald79e913e2007-05-12 11:08:06 +00004039 /* Escape backslashes */
4040 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 *p++ = '\\';
4042 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004043 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004044 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004045
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004046#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004047 /* Map 21-bit characters to '\U00xxxxxx' */
4048 else if (ch >= 0x10000) {
4049 *p++ = '\\';
4050 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004051 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4052 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4053 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4054 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4055 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4056 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4057 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4058 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004060 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004061#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4063 else if (ch >= 0xD800 && ch < 0xDC00) {
4064 Py_UNICODE ch2;
4065 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004066
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 ch2 = *s++;
4068 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004069 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4071 *p++ = '\\';
4072 *p++ = 'U';
4073 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4074 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4075 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4076 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4077 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4078 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4079 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4080 *p++ = hexdigits[ucs & 0x0000000F];
4081 continue;
4082 }
4083 /* Fall through: isolated surrogates are copied as-is */
4084 s--;
4085 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004086 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004087#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004088
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004090 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 *p++ = '\\';
4092 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004093 *p++ = hexdigits[(ch >> 12) & 0x000F];
4094 *p++ = hexdigits[(ch >> 8) & 0x000F];
4095 *p++ = hexdigits[(ch >> 4) & 0x000F];
4096 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004098
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004099 /* Map special whitespace to '\t', \n', '\r' */
4100 else if (ch == '\t') {
4101 *p++ = '\\';
4102 *p++ = 't';
4103 }
4104 else if (ch == '\n') {
4105 *p++ = '\\';
4106 *p++ = 'n';
4107 }
4108 else if (ch == '\r') {
4109 *p++ = '\\';
4110 *p++ = 'r';
4111 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004112
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004113 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004114 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004116 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004117 *p++ = hexdigits[(ch >> 4) & 0x000F];
4118 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004119 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004120
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 /* Copy everything else as-is */
4122 else
4123 *p++ = (char) ch;
4124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004126 assert(p - PyBytes_AS_STRING(repr) > 0);
4127 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4128 return NULL;
4129 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130}
4131
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004132PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004134 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 if (!PyUnicode_Check(unicode)) {
4136 PyErr_BadArgument();
4137 return NULL;
4138 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004139 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4140 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004141 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142}
4143
4144/* --- Raw Unicode Escape Codec ------------------------------------------- */
4145
4146PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 Py_ssize_t size,
4148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004151 Py_ssize_t startinpos;
4152 Py_ssize_t endinpos;
4153 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 const char *end;
4157 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 PyObject *errorHandler = NULL;
4159 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004160
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161 /* Escaped strings will always be longer than the resulting
4162 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 length after conversion to the true value. (But decoding error
4164 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 v = _PyUnicode_New(size);
4166 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 end = s + size;
4172 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 unsigned char c;
4174 Py_UCS4 x;
4175 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004176 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 /* Non-escape characters are interpreted as Unicode ordinals */
4179 if (*s != '\\') {
4180 *p++ = (unsigned char)*s++;
4181 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004182 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 startinpos = s-starts;
4184
4185 /* \u-escapes are only interpreted iff the number of leading
4186 backslashes if odd */
4187 bs = s;
4188 for (;s < end;) {
4189 if (*s != '\\')
4190 break;
4191 *p++ = (unsigned char)*s++;
4192 }
4193 if (((s - bs) & 1) == 0 ||
4194 s >= end ||
4195 (*s != 'u' && *s != 'U')) {
4196 continue;
4197 }
4198 p--;
4199 count = *s=='u' ? 4 : 8;
4200 s++;
4201
4202 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4203 outpos = p-PyUnicode_AS_UNICODE(v);
4204 for (x = 0, i = 0; i < count; ++i, ++s) {
4205 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004206 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 endinpos = s-starts;
4208 if (unicode_decode_call_errorhandler(
4209 errors, &errorHandler,
4210 "rawunicodeescape", "truncated \\uXXXX",
4211 &starts, &end, &startinpos, &endinpos, &exc, &s,
4212 &v, &outpos, &p))
4213 goto onError;
4214 goto nextByte;
4215 }
4216 x = (x<<4) & ~0xF;
4217 if (c >= '0' && c <= '9')
4218 x += c - '0';
4219 else if (c >= 'a' && c <= 'f')
4220 x += 10 + c - 'a';
4221 else
4222 x += 10 + c - 'A';
4223 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004224 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 /* UCS-2 character */
4226 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004227 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 /* UCS-4 character. Either store directly, or as
4229 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004230#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004232#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 x -= 0x10000L;
4234 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4235 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004236#endif
4237 } else {
4238 endinpos = s-starts;
4239 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004240 if (unicode_decode_call_errorhandler(
4241 errors, &errorHandler,
4242 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 &starts, &end, &startinpos, &endinpos, &exc, &s,
4244 &v, &outpos, &p))
4245 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004246 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 nextByte:
4248 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004250 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 Py_XDECREF(errorHandler);
4253 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004255
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 Py_XDECREF(errorHandler);
4259 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 return NULL;
4261}
4262
4263PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004264 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004266 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267 char *p;
4268 char *q;
4269
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004270#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004271 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004272#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004273 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004274#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004275
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004276 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004278
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004279 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 if (repr == NULL)
4281 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004282 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004283 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004285 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 while (size-- > 0) {
4287 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004288#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 /* Map 32-bit characters to '\Uxxxxxxxx' */
4290 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004291 *p++ = '\\';
4292 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004293 *p++ = hexdigits[(ch >> 28) & 0xf];
4294 *p++ = hexdigits[(ch >> 24) & 0xf];
4295 *p++ = hexdigits[(ch >> 20) & 0xf];
4296 *p++ = hexdigits[(ch >> 16) & 0xf];
4297 *p++ = hexdigits[(ch >> 12) & 0xf];
4298 *p++ = hexdigits[(ch >> 8) & 0xf];
4299 *p++ = hexdigits[(ch >> 4) & 0xf];
4300 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004301 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004302 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004303#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004304 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4305 if (ch >= 0xD800 && ch < 0xDC00) {
4306 Py_UNICODE ch2;
4307 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004308
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 ch2 = *s++;
4310 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004311 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4313 *p++ = '\\';
4314 *p++ = 'U';
4315 *p++ = hexdigits[(ucs >> 28) & 0xf];
4316 *p++ = hexdigits[(ucs >> 24) & 0xf];
4317 *p++ = hexdigits[(ucs >> 20) & 0xf];
4318 *p++ = hexdigits[(ucs >> 16) & 0xf];
4319 *p++ = hexdigits[(ucs >> 12) & 0xf];
4320 *p++ = hexdigits[(ucs >> 8) & 0xf];
4321 *p++ = hexdigits[(ucs >> 4) & 0xf];
4322 *p++ = hexdigits[ucs & 0xf];
4323 continue;
4324 }
4325 /* Fall through: isolated surrogates are copied as-is */
4326 s--;
4327 size++;
4328 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004329#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004330 /* Map 16-bit characters to '\uxxxx' */
4331 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 *p++ = '\\';
4333 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004334 *p++ = hexdigits[(ch >> 12) & 0xf];
4335 *p++ = hexdigits[(ch >> 8) & 0xf];
4336 *p++ = hexdigits[(ch >> 4) & 0xf];
4337 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 /* Copy everything else as-is */
4340 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 *p++ = (char) ch;
4342 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004343 size = p - q;
4344
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004345 assert(size > 0);
4346 if (_PyBytes_Resize(&repr, size) < 0)
4347 return NULL;
4348 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349}
4350
4351PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4352{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004353 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004355 PyErr_BadArgument();
4356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004358 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4359 PyUnicode_GET_SIZE(unicode));
4360
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004361 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362}
4363
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004364/* --- Unicode Internal Codec ------------------------------------------- */
4365
4366PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 Py_ssize_t size,
4368 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004369{
4370 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004371 Py_ssize_t startinpos;
4372 Py_ssize_t endinpos;
4373 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004374 PyUnicodeObject *v;
4375 Py_UNICODE *p;
4376 const char *end;
4377 const char *reason;
4378 PyObject *errorHandler = NULL;
4379 PyObject *exc = NULL;
4380
Neal Norwitzd43069c2006-01-08 01:12:10 +00004381#ifdef Py_UNICODE_WIDE
4382 Py_UNICODE unimax = PyUnicode_GetMax();
4383#endif
4384
Thomas Wouters89f507f2006-12-13 04:49:30 +00004385 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004386 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4387 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004389 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004391 p = PyUnicode_AS_UNICODE(v);
4392 end = s + size;
4393
4394 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004395 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004396 /* We have to sanity check the raw data, otherwise doom looms for
4397 some malformed UCS-4 data. */
4398 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004399#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004400 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004401#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004402 end-s < Py_UNICODE_SIZE
4403 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004405 startinpos = s - starts;
4406 if (end-s < Py_UNICODE_SIZE) {
4407 endinpos = end-starts;
4408 reason = "truncated input";
4409 }
4410 else {
4411 endinpos = s - starts + Py_UNICODE_SIZE;
4412 reason = "illegal code point (> 0x10FFFF)";
4413 }
4414 outpos = p - PyUnicode_AS_UNICODE(v);
4415 if (unicode_decode_call_errorhandler(
4416 errors, &errorHandler,
4417 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004418 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004419 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004420 goto onError;
4421 }
4422 }
4423 else {
4424 p++;
4425 s += Py_UNICODE_SIZE;
4426 }
4427 }
4428
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004429 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004430 goto onError;
4431 Py_XDECREF(errorHandler);
4432 Py_XDECREF(exc);
4433 return (PyObject *)v;
4434
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004436 Py_XDECREF(v);
4437 Py_XDECREF(errorHandler);
4438 Py_XDECREF(exc);
4439 return NULL;
4440}
4441
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442/* --- Latin-1 Codec ------------------------------------------------------ */
4443
4444PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 Py_ssize_t size,
4446 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447{
4448 PyUnicodeObject *v;
4449 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004450 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004451
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004453 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 Py_UNICODE r = *(unsigned char*)s;
4455 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004456 }
4457
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 v = _PyUnicode_New(size);
4459 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004464 e = s + size;
4465 /* Unrolling the copy makes it much faster by reducing the looping
4466 overhead. This is similar to what many memcpy() implementations do. */
4467 unrolled_end = e - 4;
4468 while (s < unrolled_end) {
4469 p[0] = (unsigned char) s[0];
4470 p[1] = (unsigned char) s[1];
4471 p[2] = (unsigned char) s[2];
4472 p[3] = (unsigned char) s[3];
4473 s += 4;
4474 p += 4;
4475 }
4476 while (s < e)
4477 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004479
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 Py_XDECREF(v);
4482 return NULL;
4483}
4484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485/* create or adjust a UnicodeEncodeError */
4486static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 const char *encoding,
4488 const Py_UNICODE *unicode, Py_ssize_t size,
4489 Py_ssize_t startpos, Py_ssize_t endpos,
4490 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 *exceptionObject = PyUnicodeEncodeError_Create(
4494 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 }
4496 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4498 goto onError;
4499 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4500 goto onError;
4501 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4502 goto onError;
4503 return;
4504 onError:
4505 Py_DECREF(*exceptionObject);
4506 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 }
4508}
4509
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510/* raises a UnicodeEncodeError */
4511static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 const char *encoding,
4513 const Py_UNICODE *unicode, Py_ssize_t size,
4514 Py_ssize_t startpos, Py_ssize_t endpos,
4515 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516{
4517 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521}
4522
4523/* error handling callback helper:
4524 build arguments, call the callback and check the arguments,
4525 put the result into newpos and return the replacement string, which
4526 has to be freed by the caller */
4527static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 PyObject **errorHandler,
4529 const char *encoding, const char *reason,
4530 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4531 Py_ssize_t startpos, Py_ssize_t endpos,
4532 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004534 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535
4536 PyObject *restuple;
4537 PyObject *resunicode;
4538
4539 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004543 }
4544
4545 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549
4550 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004555 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 Py_DECREF(restuple);
4557 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004559 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 &resunicode, newpos)) {
4561 Py_DECREF(restuple);
4562 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004564 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4565 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4566 Py_DECREF(restuple);
4567 return NULL;
4568 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004571 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004572 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4573 Py_DECREF(restuple);
4574 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004575 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 Py_INCREF(resunicode);
4577 Py_DECREF(restuple);
4578 return resunicode;
4579}
4580
4581static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 Py_ssize_t size,
4583 const char *errors,
4584 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585{
4586 /* output object */
4587 PyObject *res;
4588 /* pointers to the beginning and end+1 of input */
4589 const Py_UNICODE *startp = p;
4590 const Py_UNICODE *endp = p + size;
4591 /* pointer to the beginning of the unencodable characters */
4592 /* const Py_UNICODE *badp = NULL; */
4593 /* pointer into the output */
4594 char *str;
4595 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004596 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004597 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4598 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 PyObject *errorHandler = NULL;
4600 PyObject *exc = NULL;
4601 /* the following variable is used for caching string comparisons
4602 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4603 int known_errorHandler = -1;
4604
4605 /* allocate enough for a simple encoding without
4606 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004607 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004608 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004609 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004611 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004612 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 ressize = size;
4614
4615 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 /* can we encode this? */
4619 if (c<limit) {
4620 /* no overflow check, because we know that the space is enough */
4621 *str++ = (char)c;
4622 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004623 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004624 else {
4625 Py_ssize_t unicodepos = p-startp;
4626 Py_ssize_t requiredsize;
4627 PyObject *repunicode;
4628 Py_ssize_t repsize;
4629 Py_ssize_t newpos;
4630 Py_ssize_t respos;
4631 Py_UNICODE *uni2;
4632 /* startpos for collecting unencodable chars */
4633 const Py_UNICODE *collstart = p;
4634 const Py_UNICODE *collend = p;
4635 /* find all unecodable characters */
4636 while ((collend < endp) && ((*collend)>=limit))
4637 ++collend;
4638 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4639 if (known_errorHandler==-1) {
4640 if ((errors==NULL) || (!strcmp(errors, "strict")))
4641 known_errorHandler = 1;
4642 else if (!strcmp(errors, "replace"))
4643 known_errorHandler = 2;
4644 else if (!strcmp(errors, "ignore"))
4645 known_errorHandler = 3;
4646 else if (!strcmp(errors, "xmlcharrefreplace"))
4647 known_errorHandler = 4;
4648 else
4649 known_errorHandler = 0;
4650 }
4651 switch (known_errorHandler) {
4652 case 1: /* strict */
4653 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4654 goto onError;
4655 case 2: /* replace */
4656 while (collstart++<collend)
4657 *str++ = '?'; /* fall through */
4658 case 3: /* ignore */
4659 p = collend;
4660 break;
4661 case 4: /* xmlcharrefreplace */
4662 respos = str - PyBytes_AS_STRING(res);
4663 /* determine replacement size (temporarily (mis)uses p) */
4664 for (p = collstart, repsize = 0; p < collend; ++p) {
4665 if (*p<10)
4666 repsize += 2+1+1;
4667 else if (*p<100)
4668 repsize += 2+2+1;
4669 else if (*p<1000)
4670 repsize += 2+3+1;
4671 else if (*p<10000)
4672 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004673#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 else
4675 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004676#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 else if (*p<100000)
4678 repsize += 2+5+1;
4679 else if (*p<1000000)
4680 repsize += 2+6+1;
4681 else
4682 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004683#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 }
4685 requiredsize = respos+repsize+(endp-collend);
4686 if (requiredsize > ressize) {
4687 if (requiredsize<2*ressize)
4688 requiredsize = 2*ressize;
4689 if (_PyBytes_Resize(&res, requiredsize))
4690 goto onError;
4691 str = PyBytes_AS_STRING(res) + respos;
4692 ressize = requiredsize;
4693 }
4694 /* generate replacement (temporarily (mis)uses p) */
4695 for (p = collstart; p < collend; ++p) {
4696 str += sprintf(str, "&#%d;", (int)*p);
4697 }
4698 p = collend;
4699 break;
4700 default:
4701 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4702 encoding, reason, startp, size, &exc,
4703 collstart-startp, collend-startp, &newpos);
4704 if (repunicode == NULL)
4705 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004706 if (PyBytes_Check(repunicode)) {
4707 /* Directly copy bytes result to output. */
4708 repsize = PyBytes_Size(repunicode);
4709 if (repsize > 1) {
4710 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004711 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004712 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4713 Py_DECREF(repunicode);
4714 goto onError;
4715 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004716 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004717 ressize += repsize-1;
4718 }
4719 memcpy(str, PyBytes_AsString(repunicode), repsize);
4720 str += repsize;
4721 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004722 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004723 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004724 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 /* need more space? (at least enough for what we
4726 have+the replacement+the rest of the string, so
4727 we won't have to check space for encodable characters) */
4728 respos = str - PyBytes_AS_STRING(res);
4729 repsize = PyUnicode_GET_SIZE(repunicode);
4730 requiredsize = respos+repsize+(endp-collend);
4731 if (requiredsize > ressize) {
4732 if (requiredsize<2*ressize)
4733 requiredsize = 2*ressize;
4734 if (_PyBytes_Resize(&res, requiredsize)) {
4735 Py_DECREF(repunicode);
4736 goto onError;
4737 }
4738 str = PyBytes_AS_STRING(res) + respos;
4739 ressize = requiredsize;
4740 }
4741 /* check if there is anything unencodable in the replacement
4742 and copy it to the output */
4743 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4744 c = *uni2;
4745 if (c >= limit) {
4746 raise_encode_exception(&exc, encoding, startp, size,
4747 unicodepos, unicodepos+1, reason);
4748 Py_DECREF(repunicode);
4749 goto onError;
4750 }
4751 *str = (char)c;
4752 }
4753 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004754 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004755 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004756 }
4757 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004758 /* Resize if we allocated to much */
4759 size = str - PyBytes_AS_STRING(res);
4760 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004761 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004762 if (_PyBytes_Resize(&res, size) < 0)
4763 goto onError;
4764 }
4765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 Py_XDECREF(errorHandler);
4767 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004768 return res;
4769
4770 onError:
4771 Py_XDECREF(res);
4772 Py_XDECREF(errorHandler);
4773 Py_XDECREF(exc);
4774 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775}
4776
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 Py_ssize_t size,
4779 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782}
4783
4784PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4785{
4786 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 PyErr_BadArgument();
4788 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 }
4790 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 PyUnicode_GET_SIZE(unicode),
4792 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
4795/* --- 7-bit ASCII Codec -------------------------------------------------- */
4796
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 Py_ssize_t size,
4799 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 PyUnicodeObject *v;
4803 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004804 Py_ssize_t startinpos;
4805 Py_ssize_t endinpos;
4806 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 const char *e;
4808 PyObject *errorHandler = NULL;
4809 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004810
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004812 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 Py_UNICODE r = *(unsigned char*)s;
4814 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004815 }
Tim Petersced69f82003-09-16 20:30:58 +00004816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 v = _PyUnicode_New(size);
4818 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 e = s + size;
4824 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 register unsigned char c = (unsigned char)*s;
4826 if (c < 128) {
4827 *p++ = c;
4828 ++s;
4829 }
4830 else {
4831 startinpos = s-starts;
4832 endinpos = startinpos + 1;
4833 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4834 if (unicode_decode_call_errorhandler(
4835 errors, &errorHandler,
4836 "ascii", "ordinal not in range(128)",
4837 &starts, &e, &startinpos, &endinpos, &exc, &s,
4838 &v, &outpos, &p))
4839 goto onError;
4840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004842 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845 Py_XDECREF(errorHandler);
4846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004848
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 Py_XDECREF(errorHandler);
4852 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 return NULL;
4854}
4855
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 Py_ssize_t size,
4858 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861}
4862
4863PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4864{
4865 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 PyErr_BadArgument();
4867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 }
4869 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 PyUnicode_GET_SIZE(unicode),
4871 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872}
4873
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004874#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004875
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004876/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004877
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004878#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004879#define NEED_RETRY
4880#endif
4881
4882/* XXX This code is limited to "true" double-byte encodings, as
4883 a) it assumes an incomplete character consists of a single byte, and
4884 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004886
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.

   IsDBCSLeadByte() alone is not enough, because the byte could also be the
   second half of a two-byte character.  The position CharPrev() reports for
   the preceding character is used to tell the two cases apart. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *candidate = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*candidate))
        return 0;

    before = CharPrev(s, candidate);
    if (before == candidate)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (candidate - before == 2);
}
4897
4898/*
4899 * Decode MBCS string into unicode object. If 'final' is set, converts
4900 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4901 */
4902static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 const char *s, /* MBCS string */
4904 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004905 int final,
4906 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004907{
4908 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004909 Py_ssize_t n;
4910 DWORD usize;
4911 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004912
4913 assert(size >= 0);
4914
Victor Stinner554f3f02010-06-16 23:33:54 +00004915 /* check and handle 'errors' arg */
4916 if (errors==NULL || strcmp(errors, "strict")==0)
4917 flags = MB_ERR_INVALID_CHARS;
4918 else if (strcmp(errors, "ignore")==0)
4919 flags = 0;
4920 else {
4921 PyErr_Format(PyExc_ValueError,
4922 "mbcs encoding does not support errors='%s'",
4923 errors);
4924 return -1;
4925 }
4926
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004927 /* Skip trailing lead-byte unless 'final' is set */
4928 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004930
4931 /* First get the size of the result */
4932 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004933 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4934 if (usize==0)
4935 goto mbcs_decode_error;
4936 } else
4937 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004938
4939 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 /* Create unicode object */
4941 *v = _PyUnicode_New(usize);
4942 if (*v == NULL)
4943 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004944 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004945 }
4946 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 /* Extend unicode object */
4948 n = PyUnicode_GET_SIZE(*v);
4949 if (_PyUnicode_Resize(v, n + usize) < 0)
4950 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004951 }
4952
4953 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004954 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004956 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4957 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004959 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004960 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004961
4962mbcs_decode_error:
4963 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4964 we raise a UnicodeDecodeError - else it is a 'generic'
4965 windows error
4966 */
4967 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4968 /* Ideally, we should get reason from FormatMessage - this
4969 is the Windows 2000 English version of the message
4970 */
4971 PyObject *exc = NULL;
4972 const char *reason = "No mapping for the Unicode character exists "
4973 "in the target multi-byte code page.";
4974 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4975 if (exc != NULL) {
4976 PyCodec_StrictErrors(exc);
4977 Py_DECREF(exc);
4978 }
4979 } else {
4980 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4981 }
4982 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004983}
4984
4985PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 Py_ssize_t size,
4987 const char *errors,
4988 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004989{
4990 PyUnicodeObject *v = NULL;
4991 int done;
4992
4993 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004995
4996#ifdef NEED_RETRY
4997 retry:
4998 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004999 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000 else
5001#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005002 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005003
5004 if (done < 0) {
5005 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005007 }
5008
5009 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005011
5012#ifdef NEED_RETRY
5013 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 s += done;
5015 size -= done;
5016 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005017 }
5018#endif
5019
5020 return (PyObject *)v;
5021}
5022
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005023PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 Py_ssize_t size,
5025 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005026{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005027 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5028}
5029
5030/*
5031 * Convert unicode into string object (MBCS).
5032 * Returns 0 if succeed, -1 otherwise.
5033 */
5034static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005036 int size, /* size of unicode */
5037 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005038{
Victor Stinner554f3f02010-06-16 23:33:54 +00005039 BOOL usedDefaultChar = FALSE;
5040 BOOL *pusedDefaultChar;
5041 int mbcssize;
5042 Py_ssize_t n;
5043 PyObject *exc = NULL;
5044 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005045
5046 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005047
Victor Stinner554f3f02010-06-16 23:33:54 +00005048 /* check and handle 'errors' arg */
5049 if (errors==NULL || strcmp(errors, "strict")==0) {
5050 flags = WC_NO_BEST_FIT_CHARS;
5051 pusedDefaultChar = &usedDefaultChar;
5052 } else if (strcmp(errors, "replace")==0) {
5053 flags = 0;
5054 pusedDefaultChar = NULL;
5055 } else {
5056 PyErr_Format(PyExc_ValueError,
5057 "mbcs encoding does not support errors='%s'",
5058 errors);
5059 return -1;
5060 }
5061
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005062 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005063 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005064 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5065 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 if (mbcssize == 0) {
5067 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5068 return -1;
5069 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005070 /* If we used a default char, then we failed! */
5071 if (pusedDefaultChar && *pusedDefaultChar)
5072 goto mbcs_encode_error;
5073 } else {
5074 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005075 }
5076
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005077 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 /* Create string object */
5079 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5080 if (*repr == NULL)
5081 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005082 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005083 }
5084 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 /* Extend string object */
5086 n = PyBytes_Size(*repr);
5087 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5088 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005089 }
5090
5091 /* Do the conversion */
5092 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005094 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5095 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5097 return -1;
5098 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005099 if (pusedDefaultChar && *pusedDefaultChar)
5100 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005101 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005102 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005103
5104mbcs_encode_error:
5105 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5106 Py_XDECREF(exc);
5107 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005108}
5109
5110PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 Py_ssize_t size,
5112 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005113{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005114 PyObject *repr = NULL;
5115 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005116
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005117#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005119 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005120 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005121 else
5122#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005123 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005124
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005125 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 Py_XDECREF(repr);
5127 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005128 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005129
5130#ifdef NEED_RETRY
5131 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 p += INT_MAX;
5133 size -= INT_MAX;
5134 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005135 }
5136#endif
5137
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005138 return repr;
5139}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005140
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005141PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5142{
5143 if (!PyUnicode_Check(unicode)) {
5144 PyErr_BadArgument();
5145 return NULL;
5146 }
5147 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 PyUnicode_GET_SIZE(unicode),
5149 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005150}
5151
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005152#undef NEED_RETRY
5153
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005154#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005155
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156/* --- Character Mapping Codec -------------------------------------------- */
5157
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 Py_ssize_t size,
5160 PyObject *mapping,
5161 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005163 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005164 Py_ssize_t startinpos;
5165 Py_ssize_t endinpos;
5166 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005167 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 PyUnicodeObject *v;
5169 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005170 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005171 PyObject *errorHandler = NULL;
5172 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005173 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005174 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005175
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 /* Default to Latin-1 */
5177 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179
5180 v = _PyUnicode_New(size);
5181 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005186 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005187 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 mapstring = PyUnicode_AS_UNICODE(mapping);
5189 maplen = PyUnicode_GET_SIZE(mapping);
5190 while (s < e) {
5191 unsigned char ch = *s;
5192 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 if (ch < maplen)
5195 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 if (x == 0xfffe) {
5198 /* undefined mapping */
5199 outpos = p-PyUnicode_AS_UNICODE(v);
5200 startinpos = s-starts;
5201 endinpos = startinpos+1;
5202 if (unicode_decode_call_errorhandler(
5203 errors, &errorHandler,
5204 "charmap", "character maps to <undefined>",
5205 &starts, &e, &startinpos, &endinpos, &exc, &s,
5206 &v, &outpos, &p)) {
5207 goto onError;
5208 }
5209 continue;
5210 }
5211 *p++ = x;
5212 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005213 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005214 }
5215 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 while (s < e) {
5217 unsigned char ch = *s;
5218 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005219
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5221 w = PyLong_FromLong((long)ch);
5222 if (w == NULL)
5223 goto onError;
5224 x = PyObject_GetItem(mapping, w);
5225 Py_DECREF(w);
5226 if (x == NULL) {
5227 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5228 /* No mapping found means: mapping is undefined. */
5229 PyErr_Clear();
5230 x = Py_None;
5231 Py_INCREF(x);
5232 } else
5233 goto onError;
5234 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005235
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 /* Apply mapping */
5237 if (PyLong_Check(x)) {
5238 long value = PyLong_AS_LONG(x);
5239 if (value < 0 || value > 65535) {
5240 PyErr_SetString(PyExc_TypeError,
5241 "character mapping must be in range(65536)");
5242 Py_DECREF(x);
5243 goto onError;
5244 }
5245 *p++ = (Py_UNICODE)value;
5246 }
5247 else if (x == Py_None) {
5248 /* undefined mapping */
5249 outpos = p-PyUnicode_AS_UNICODE(v);
5250 startinpos = s-starts;
5251 endinpos = startinpos+1;
5252 if (unicode_decode_call_errorhandler(
5253 errors, &errorHandler,
5254 "charmap", "character maps to <undefined>",
5255 &starts, &e, &startinpos, &endinpos, &exc, &s,
5256 &v, &outpos, &p)) {
5257 Py_DECREF(x);
5258 goto onError;
5259 }
5260 Py_DECREF(x);
5261 continue;
5262 }
5263 else if (PyUnicode_Check(x)) {
5264 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005265
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 if (targetsize == 1)
5267 /* 1-1 mapping */
5268 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005269
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 else if (targetsize > 1) {
5271 /* 1-n mapping */
5272 if (targetsize > extrachars) {
5273 /* resize first */
5274 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5275 Py_ssize_t needed = (targetsize - extrachars) + \
5276 (targetsize << 2);
5277 extrachars += needed;
5278 /* XXX overflow detection missing */
5279 if (_PyUnicode_Resize(&v,
5280 PyUnicode_GET_SIZE(v) + needed) < 0) {
5281 Py_DECREF(x);
5282 goto onError;
5283 }
5284 p = PyUnicode_AS_UNICODE(v) + oldpos;
5285 }
5286 Py_UNICODE_COPY(p,
5287 PyUnicode_AS_UNICODE(x),
5288 targetsize);
5289 p += targetsize;
5290 extrachars -= targetsize;
5291 }
5292 /* 1-0 mapping: skip the character */
5293 }
5294 else {
5295 /* wrong return value */
5296 PyErr_SetString(PyExc_TypeError,
5297 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005298 Py_DECREF(x);
5299 goto onError;
5300 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 Py_DECREF(x);
5302 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 }
5305 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5307 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005308 Py_XDECREF(errorHandler);
5309 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005311
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005313 Py_XDECREF(errorHandler);
5314 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 Py_XDECREF(v);
5316 return NULL;
5317}
5318
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005319/* Charmap encoding: the lookup table */
5320
5321struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 PyObject_HEAD
5323 unsigned char level1[32];
5324 int count2, count3;
5325 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005326};
5327
5328static PyObject*
5329encoding_map_size(PyObject *obj, PyObject* args)
5330{
5331 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005332 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005334}
5335
5336static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005337 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 PyDoc_STR("Return the size (in bytes) of this object") },
5339 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005340};
5341
5342static void
5343encoding_map_dealloc(PyObject* o)
5344{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005345 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005346}
5347
5348static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005349 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 "EncodingMap", /*tp_name*/
5351 sizeof(struct encoding_map), /*tp_basicsize*/
5352 0, /*tp_itemsize*/
5353 /* methods */
5354 encoding_map_dealloc, /*tp_dealloc*/
5355 0, /*tp_print*/
5356 0, /*tp_getattr*/
5357 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005358 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 0, /*tp_repr*/
5360 0, /*tp_as_number*/
5361 0, /*tp_as_sequence*/
5362 0, /*tp_as_mapping*/
5363 0, /*tp_hash*/
5364 0, /*tp_call*/
5365 0, /*tp_str*/
5366 0, /*tp_getattro*/
5367 0, /*tp_setattro*/
5368 0, /*tp_as_buffer*/
5369 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5370 0, /*tp_doc*/
5371 0, /*tp_traverse*/
5372 0, /*tp_clear*/
5373 0, /*tp_richcompare*/
5374 0, /*tp_weaklistoffset*/
5375 0, /*tp_iter*/
5376 0, /*tp_iternext*/
5377 encoding_map_methods, /*tp_methods*/
5378 0, /*tp_members*/
5379 0, /*tp_getset*/
5380 0, /*tp_base*/
5381 0, /*tp_dict*/
5382 0, /*tp_descr_get*/
5383 0, /*tp_descr_set*/
5384 0, /*tp_dictoffset*/
5385 0, /*tp_init*/
5386 0, /*tp_alloc*/
5387 0, /*tp_new*/
5388 0, /*tp_free*/
5389 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005390};
5391
5392PyObject*
5393PyUnicode_BuildEncodingMap(PyObject* string)
5394{
5395 Py_UNICODE *decode;
5396 PyObject *result;
5397 struct encoding_map *mresult;
5398 int i;
5399 int need_dict = 0;
5400 unsigned char level1[32];
5401 unsigned char level2[512];
5402 unsigned char *mlevel1, *mlevel2, *mlevel3;
5403 int count2 = 0, count3 = 0;
5404
5405 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5406 PyErr_BadArgument();
5407 return NULL;
5408 }
5409 decode = PyUnicode_AS_UNICODE(string);
5410 memset(level1, 0xFF, sizeof level1);
5411 memset(level2, 0xFF, sizeof level2);
5412
5413 /* If there isn't a one-to-one mapping of NULL to \0,
5414 or if there are non-BMP characters, we need to use
5415 a mapping dictionary. */
5416 if (decode[0] != 0)
5417 need_dict = 1;
5418 for (i = 1; i < 256; i++) {
5419 int l1, l2;
5420 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005421#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005422 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005423#endif
5424 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005425 need_dict = 1;
5426 break;
5427 }
5428 if (decode[i] == 0xFFFE)
5429 /* unmapped character */
5430 continue;
5431 l1 = decode[i] >> 11;
5432 l2 = decode[i] >> 7;
5433 if (level1[l1] == 0xFF)
5434 level1[l1] = count2++;
5435 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005436 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005437 }
5438
5439 if (count2 >= 0xFF || count3 >= 0xFF)
5440 need_dict = 1;
5441
5442 if (need_dict) {
5443 PyObject *result = PyDict_New();
5444 PyObject *key, *value;
5445 if (!result)
5446 return NULL;
5447 for (i = 0; i < 256; i++) {
5448 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005449 key = PyLong_FromLong(decode[i]);
5450 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005451 if (!key || !value)
5452 goto failed1;
5453 if (PyDict_SetItem(result, key, value) == -1)
5454 goto failed1;
5455 Py_DECREF(key);
5456 Py_DECREF(value);
5457 }
5458 return result;
5459 failed1:
5460 Py_XDECREF(key);
5461 Py_XDECREF(value);
5462 Py_DECREF(result);
5463 return NULL;
5464 }
5465
5466 /* Create a three-level trie */
5467 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5468 16*count2 + 128*count3 - 1);
5469 if (!result)
5470 return PyErr_NoMemory();
5471 PyObject_Init(result, &EncodingMapType);
5472 mresult = (struct encoding_map*)result;
5473 mresult->count2 = count2;
5474 mresult->count3 = count3;
5475 mlevel1 = mresult->level1;
5476 mlevel2 = mresult->level23;
5477 mlevel3 = mresult->level23 + 16*count2;
5478 memcpy(mlevel1, level1, 32);
5479 memset(mlevel2, 0xFF, 16*count2);
5480 memset(mlevel3, 0, 128*count3);
5481 count3 = 0;
5482 for (i = 1; i < 256; i++) {
5483 int o1, o2, o3, i2, i3;
5484 if (decode[i] == 0xFFFE)
5485 /* unmapped character */
5486 continue;
5487 o1 = decode[i]>>11;
5488 o2 = (decode[i]>>7) & 0xF;
5489 i2 = 16*mlevel1[o1] + o2;
5490 if (mlevel2[i2] == 0xFF)
5491 mlevel2[i2] = count3++;
5492 o3 = decode[i] & 0x7F;
5493 i3 = 128*mlevel2[i2] + o3;
5494 mlevel3[i3] = i;
5495 }
5496 return result;
5497}
5498
5499static int
5500encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5501{
5502 struct encoding_map *map = (struct encoding_map*)mapping;
5503 int l1 = c>>11;
5504 int l2 = (c>>7) & 0xF;
5505 int l3 = c & 0x7F;
5506 int i;
5507
5508#ifdef Py_UNICODE_WIDE
5509 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005511 }
5512#endif
5513 if (c == 0)
5514 return 0;
5515 /* level 1*/
5516 i = map->level1[l1];
5517 if (i == 0xFF) {
5518 return -1;
5519 }
5520 /* level 2*/
5521 i = map->level23[16*i+l2];
5522 if (i == 0xFF) {
5523 return -1;
5524 }
5525 /* level 3 */
5526 i = map->level23[16*map->count2 + 128*i + l3];
5527 if (i == 0) {
5528 return -1;
5529 }
5530 return i;
5531}
5532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005533/* Lookup the character ch in the mapping. If the character
5534 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005535 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005536static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537{
Christian Heimes217cfd12007-12-02 14:31:20 +00005538 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005539 PyObject *x;
5540
5541 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005543 x = PyObject_GetItem(mapping, w);
5544 Py_DECREF(w);
5545 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5547 /* No mapping found means: mapping is undefined. */
5548 PyErr_Clear();
5549 x = Py_None;
5550 Py_INCREF(x);
5551 return x;
5552 } else
5553 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005555 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005557 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 long value = PyLong_AS_LONG(x);
5559 if (value < 0 || value > 255) {
5560 PyErr_SetString(PyExc_TypeError,
5561 "character mapping must be in range(256)");
5562 Py_DECREF(x);
5563 return NULL;
5564 }
5565 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005567 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 /* wrong return value */
5571 PyErr_Format(PyExc_TypeError,
5572 "character mapping must return integer, bytes or None, not %.400s",
5573 x->ob_type->tp_name);
5574 Py_DECREF(x);
5575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 }
5577}
5578
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005579static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005580charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005581{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005582 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5583 /* exponentially overallocate to minimize reallocations */
5584 if (requiredsize < 2*outsize)
5585 requiredsize = 2*outsize;
5586 if (_PyBytes_Resize(outobj, requiredsize))
5587 return -1;
5588 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005589}
5590
/* Outcome of trying to encode a single character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* output was written */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* a Python exception occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005595 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596 space is available. Return a new reference to the object that
5597 was put in the output buffer, or Py_None, if the mapping was undefined
5598 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005599 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005601charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005602 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005604 PyObject *rep;
5605 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005606 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607
Christian Heimes90aa7642007-12-19 02:45:37 +00005608 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005609 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005611 if (res == -1)
5612 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 if (outsize<requiredsize)
5614 if (charmapencode_resize(outobj, outpos, requiredsize))
5615 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005616 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 outstart[(*outpos)++] = (char)res;
5618 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005619 }
5620
5621 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005624 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 Py_DECREF(rep);
5626 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005627 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 if (PyLong_Check(rep)) {
5629 Py_ssize_t requiredsize = *outpos+1;
5630 if (outsize<requiredsize)
5631 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5632 Py_DECREF(rep);
5633 return enc_EXCEPTION;
5634 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005635 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005637 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005638 else {
5639 const char *repchars = PyBytes_AS_STRING(rep);
5640 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5641 Py_ssize_t requiredsize = *outpos+repsize;
5642 if (outsize<requiredsize)
5643 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5644 Py_DECREF(rep);
5645 return enc_EXCEPTION;
5646 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005647 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 memcpy(outstart + *outpos, repchars, repsize);
5649 *outpos += repsize;
5650 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005652 Py_DECREF(rep);
5653 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654}
5655
5656/* handle an error in PyUnicode_EncodeCharmap
5657 Return 0 on success, -1 on error */
5658static
5659int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005662 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005663 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664{
5665 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005666 Py_ssize_t repsize;
5667 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005668 Py_UNICODE *uni2;
5669 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005670 Py_ssize_t collstartpos = *inpos;
5671 Py_ssize_t collendpos = *inpos+1;
5672 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005673 char *encoding = "charmap";
5674 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005675 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677 /* find all unencodable characters */
5678 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005679 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005680 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 int res = encoding_map_lookup(p[collendpos], mapping);
5682 if (res != -1)
5683 break;
5684 ++collendpos;
5685 continue;
5686 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005687
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 rep = charmapencode_lookup(p[collendpos], mapping);
5689 if (rep==NULL)
5690 return -1;
5691 else if (rep!=Py_None) {
5692 Py_DECREF(rep);
5693 break;
5694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005695 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 }
5698 /* cache callback name lookup
5699 * (if not done yet, i.e. it's the first error) */
5700 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 if ((errors==NULL) || (!strcmp(errors, "strict")))
5702 *known_errorHandler = 1;
5703 else if (!strcmp(errors, "replace"))
5704 *known_errorHandler = 2;
5705 else if (!strcmp(errors, "ignore"))
5706 *known_errorHandler = 3;
5707 else if (!strcmp(errors, "xmlcharrefreplace"))
5708 *known_errorHandler = 4;
5709 else
5710 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 }
5712 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005713 case 1: /* strict */
5714 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5715 return -1;
5716 case 2: /* replace */
5717 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 x = charmapencode_output('?', mapping, res, respos);
5719 if (x==enc_EXCEPTION) {
5720 return -1;
5721 }
5722 else if (x==enc_FAILED) {
5723 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5724 return -1;
5725 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005726 }
5727 /* fall through */
5728 case 3: /* ignore */
5729 *inpos = collendpos;
5730 break;
5731 case 4: /* xmlcharrefreplace */
5732 /* generate replacement (temporarily (mis)uses p) */
5733 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 char buffer[2+29+1+1];
5735 char *cp;
5736 sprintf(buffer, "&#%d;", (int)p[collpos]);
5737 for (cp = buffer; *cp; ++cp) {
5738 x = charmapencode_output(*cp, mapping, res, respos);
5739 if (x==enc_EXCEPTION)
5740 return -1;
5741 else if (x==enc_FAILED) {
5742 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5743 return -1;
5744 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005745 }
5746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005747 *inpos = collendpos;
5748 break;
5749 default:
5750 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 encoding, reason, p, size, exceptionObject,
5752 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005753 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005755 if (PyBytes_Check(repunicode)) {
5756 /* Directly copy bytes result to output. */
5757 Py_ssize_t outsize = PyBytes_Size(*res);
5758 Py_ssize_t requiredsize;
5759 repsize = PyBytes_Size(repunicode);
5760 requiredsize = *respos + repsize;
5761 if (requiredsize > outsize)
5762 /* Make room for all additional bytes. */
5763 if (charmapencode_resize(res, respos, requiredsize)) {
5764 Py_DECREF(repunicode);
5765 return -1;
5766 }
5767 memcpy(PyBytes_AsString(*res) + *respos,
5768 PyBytes_AsString(repunicode), repsize);
5769 *respos += repsize;
5770 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005771 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005772 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005774 /* generate replacement */
5775 repsize = PyUnicode_GET_SIZE(repunicode);
5776 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 x = charmapencode_output(*uni2, mapping, res, respos);
5778 if (x==enc_EXCEPTION) {
5779 return -1;
5780 }
5781 else if (x==enc_FAILED) {
5782 Py_DECREF(repunicode);
5783 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5784 return -1;
5785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005786 }
5787 *inpos = newpos;
5788 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 }
5790 return 0;
5791}
5792
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 Py_ssize_t size,
5795 PyObject *mapping,
5796 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 /* output object */
5799 PyObject *res = NULL;
5800 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005801 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005803 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 PyObject *errorHandler = NULL;
5805 PyObject *exc = NULL;
5806 /* the following variable is used for caching string comparisons
5807 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5808 * 3=ignore, 4=xmlcharrefreplace */
5809 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810
5811 /* Default to Latin-1 */
5812 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815 /* allocate enough for a simple encoding without
5816 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005817 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 if (res == NULL)
5819 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005820 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 /* try to encode it */
5825 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5826 if (x==enc_EXCEPTION) /* error */
5827 goto onError;
5828 if (x==enc_FAILED) { /* unencodable character */
5829 if (charmap_encoding_error(p, size, &inpos, mapping,
5830 &exc,
5831 &known_errorHandler, &errorHandler, errors,
5832 &res, &respos)) {
5833 goto onError;
5834 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005835 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 else
5837 /* done with this character => adjust input position */
5838 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005841 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005842 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005843 if (_PyBytes_Resize(&res, respos) < 0)
5844 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846 Py_XDECREF(exc);
5847 Py_XDECREF(errorHandler);
5848 return res;
5849
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 Py_XDECREF(res);
5852 Py_XDECREF(exc);
5853 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 return NULL;
5855}
5856
5857PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859{
5860 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 PyErr_BadArgument();
5862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 }
5864 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 PyUnicode_GET_SIZE(unicode),
5866 mapping,
5867 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868}
5869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870/* create or adjust a UnicodeTranslateError */
5871static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 const Py_UNICODE *unicode, Py_ssize_t size,
5873 Py_ssize_t startpos, Py_ssize_t endpos,
5874 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005877 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 }
5880 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5882 goto onError;
5883 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5884 goto onError;
5885 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5886 goto onError;
5887 return;
5888 onError:
5889 Py_DECREF(*exceptionObject);
5890 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 }
5892}
5893
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894/* raises a UnicodeTranslateError */
5895static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 const Py_UNICODE *unicode, Py_ssize_t size,
5897 Py_ssize_t startpos, Py_ssize_t endpos,
5898 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899{
5900 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904}
5905
5906/* error handling callback helper:
5907 build arguments, call the callback and check the arguments,
5908 put the result into newpos and return the replacement string, which
5909 has to be freed by the caller */
5910static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 PyObject **errorHandler,
5912 const char *reason,
5913 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5914 Py_ssize_t startpos, Py_ssize_t endpos,
5915 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005917 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005919 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005920 PyObject *restuple;
5921 PyObject *resunicode;
5922
5923 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 }
5928
5929 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005933
5934 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005939 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 Py_DECREF(restuple);
5941 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005942 }
5943 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 &resunicode, &i_newpos)) {
5945 Py_DECREF(restuple);
5946 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005948 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005950 else
5951 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005952 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5954 Py_DECREF(restuple);
5955 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005956 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005957 Py_INCREF(resunicode);
5958 Py_DECREF(restuple);
5959 return resunicode;
5960}
5961
5962/* Lookup the character ch in the mapping and put the result in result,
5963 which must be decrefed by the caller.
5964 Return 0 on success, -1 on error */
5965static
5966int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5967{
Christian Heimes217cfd12007-12-02 14:31:20 +00005968 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005969 PyObject *x;
5970
5971 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005973 x = PyObject_GetItem(mapping, w);
5974 Py_DECREF(w);
5975 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5977 /* No mapping found means: use 1:1 mapping. */
5978 PyErr_Clear();
5979 *result = NULL;
5980 return 0;
5981 } else
5982 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005983 }
5984 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 *result = x;
5986 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005988 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 long value = PyLong_AS_LONG(x);
5990 long max = PyUnicode_GetMax();
5991 if (value < 0 || value > max) {
5992 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005993 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 Py_DECREF(x);
5995 return -1;
5996 }
5997 *result = x;
5998 return 0;
5999 }
6000 else if (PyUnicode_Check(x)) {
6001 *result = x;
6002 return 0;
6003 }
6004 else {
6005 /* wrong return value */
6006 PyErr_SetString(PyExc_TypeError,
6007 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006008 Py_DECREF(x);
6009 return -1;
6010 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011}
6012/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 if not reallocate and adjust various state variables.
6014 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015static
Walter Dörwald4894c302003-10-24 14:25:28 +00006016int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006019 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006020 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 /* remember old output position */
6022 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6023 /* exponentially overallocate to minimize reallocations */
6024 if (requiredsize < 2 * oldsize)
6025 requiredsize = 2 * oldsize;
6026 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6027 return -1;
6028 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 }
6030 return 0;
6031}
6032/* lookup the character, put the result in the output string and adjust
6033 various state variables. Return a new reference to the object that
6034 was put in the output buffer in *result, or Py_None, if the mapping was
6035 undefined (in which case no character was written).
6036 The called must decref result.
6037 Return 0 on success, -1 on error. */
6038static
Walter Dörwald4894c302003-10-24 14:25:28 +00006039int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6041 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042{
Walter Dörwald4894c302003-10-24 14:25:28 +00006043 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 /* not found => default to 1:1 mapping */
6047 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048 }
6049 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006051 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 /* no overflow check, because we know that the space is enough */
6053 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054 }
6055 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6057 if (repsize==1) {
6058 /* no overflow check, because we know that the space is enough */
6059 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6060 }
6061 else if (repsize!=0) {
6062 /* more than one character */
6063 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6064 (insize - (curinp-startinp)) +
6065 repsize - 1;
6066 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6067 return -1;
6068 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6069 *outp += repsize;
6070 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 }
6072 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074 return 0;
6075}
6076
6077PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 Py_ssize_t size,
6079 PyObject *mapping,
6080 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 /* output object */
6083 PyObject *res = NULL;
6084 /* pointers to the beginning and end+1 of input */
6085 const Py_UNICODE *startp = p;
6086 const Py_UNICODE *endp = p + size;
6087 /* pointer into the output */
6088 Py_UNICODE *str;
6089 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006090 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006091 char *reason = "character maps to <undefined>";
6092 PyObject *errorHandler = NULL;
6093 PyObject *exc = NULL;
6094 /* the following variable is used for caching string comparisons
6095 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6096 * 3=ignore, 4=xmlcharrefreplace */
6097 int known_errorHandler = -1;
6098
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 PyErr_BadArgument();
6101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103
6104 /* allocate enough for a simple 1:1 translation without
6105 replacements, if we need more, we'll resize */
6106 res = PyUnicode_FromUnicode(NULL, size);
6107 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 /* try to encode it */
6115 PyObject *x = NULL;
6116 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6117 Py_XDECREF(x);
6118 goto onError;
6119 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006120 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 if (x!=Py_None) /* it worked => adjust input pointer */
6122 ++p;
6123 else { /* untranslatable character */
6124 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6125 Py_ssize_t repsize;
6126 Py_ssize_t newpos;
6127 Py_UNICODE *uni2;
6128 /* startpos for collecting untranslatable chars */
6129 const Py_UNICODE *collstart = p;
6130 const Py_UNICODE *collend = p+1;
6131 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 /* find all untranslatable characters */
6134 while (collend < endp) {
6135 if (charmaptranslate_lookup(*collend, mapping, &x))
6136 goto onError;
6137 Py_XDECREF(x);
6138 if (x!=Py_None)
6139 break;
6140 ++collend;
6141 }
6142 /* cache callback name lookup
6143 * (if not done yet, i.e. it's the first error) */
6144 if (known_errorHandler==-1) {
6145 if ((errors==NULL) || (!strcmp(errors, "strict")))
6146 known_errorHandler = 1;
6147 else if (!strcmp(errors, "replace"))
6148 known_errorHandler = 2;
6149 else if (!strcmp(errors, "ignore"))
6150 known_errorHandler = 3;
6151 else if (!strcmp(errors, "xmlcharrefreplace"))
6152 known_errorHandler = 4;
6153 else
6154 known_errorHandler = 0;
6155 }
6156 switch (known_errorHandler) {
6157 case 1: /* strict */
6158 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006159 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 case 2: /* replace */
6161 /* No need to check for space, this is a 1:1 replacement */
6162 for (coll = collstart; coll<collend; ++coll)
6163 *str++ = '?';
6164 /* fall through */
6165 case 3: /* ignore */
6166 p = collend;
6167 break;
6168 case 4: /* xmlcharrefreplace */
6169 /* generate replacement (temporarily (mis)uses p) */
6170 for (p = collstart; p < collend; ++p) {
6171 char buffer[2+29+1+1];
6172 char *cp;
6173 sprintf(buffer, "&#%d;", (int)*p);
6174 if (charmaptranslate_makespace(&res, &str,
6175 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6176 goto onError;
6177 for (cp = buffer; *cp; ++cp)
6178 *str++ = *cp;
6179 }
6180 p = collend;
6181 break;
6182 default:
6183 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6184 reason, startp, size, &exc,
6185 collstart-startp, collend-startp, &newpos);
6186 if (repunicode == NULL)
6187 goto onError;
6188 /* generate replacement */
6189 repsize = PyUnicode_GET_SIZE(repunicode);
6190 if (charmaptranslate_makespace(&res, &str,
6191 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6192 Py_DECREF(repunicode);
6193 goto onError;
6194 }
6195 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6196 *str++ = *uni2;
6197 p = startp + newpos;
6198 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006199 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006200 }
6201 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202 /* Resize if we allocated to much */
6203 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006204 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 if (PyUnicode_Resize(&res, respos) < 0)
6206 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 }
6208 Py_XDECREF(exc);
6209 Py_XDECREF(errorHandler);
6210 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006213 Py_XDECREF(res);
6214 Py_XDECREF(exc);
6215 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 return NULL;
6217}
6218
6219PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 PyObject *mapping,
6221 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222{
6223 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006224
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 str = PyUnicode_FromObject(str);
6226 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 PyUnicode_GET_SIZE(str),
6230 mapping,
6231 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 Py_DECREF(str);
6233 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006234
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 Py_XDECREF(str);
6237 return NULL;
6238}
Tim Petersced69f82003-09-16 20:30:58 +00006239
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006240PyObject *
6241PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6242 Py_ssize_t length)
6243{
6244 PyObject *result;
6245 Py_UNICODE *p; /* write pointer into result */
6246 Py_ssize_t i;
6247 /* Copy to a new string */
6248 result = (PyObject *)_PyUnicode_New(length);
6249 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6250 if (result == NULL)
6251 return result;
6252 p = PyUnicode_AS_UNICODE(result);
6253 /* Iterate over code points */
6254 for (i = 0; i < length; i++) {
6255 Py_UNICODE ch =s[i];
6256 if (ch > 127) {
6257 int decimal = Py_UNICODE_TODECIMAL(ch);
6258 if (decimal >= 0)
6259 p[i] = '0' + decimal;
6260 }
6261 }
6262 return result;
6263}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006264/* --- Decimal Encoder ---------------------------------------------------- */
6265
6266int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 Py_ssize_t length,
6268 char *output,
6269 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006270{
6271 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 PyObject *errorHandler = NULL;
6273 PyObject *exc = NULL;
6274 const char *encoding = "decimal";
6275 const char *reason = "invalid decimal Unicode string";
6276 /* the following variable is used for caching string comparisons
6277 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6278 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006279
6280 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 PyErr_BadArgument();
6282 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006283 }
6284
6285 p = s;
6286 end = s + length;
6287 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 register Py_UNICODE ch = *p;
6289 int decimal;
6290 PyObject *repunicode;
6291 Py_ssize_t repsize;
6292 Py_ssize_t newpos;
6293 Py_UNICODE *uni2;
6294 Py_UNICODE *collstart;
6295 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006296
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006298 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 ++p;
6300 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006301 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 decimal = Py_UNICODE_TODECIMAL(ch);
6303 if (decimal >= 0) {
6304 *output++ = '0' + decimal;
6305 ++p;
6306 continue;
6307 }
6308 if (0 < ch && ch < 256) {
6309 *output++ = (char)ch;
6310 ++p;
6311 continue;
6312 }
6313 /* All other characters are considered unencodable */
6314 collstart = p;
6315 collend = p+1;
6316 while (collend < end) {
6317 if ((0 < *collend && *collend < 256) ||
6318 !Py_UNICODE_ISSPACE(*collend) ||
6319 Py_UNICODE_TODECIMAL(*collend))
6320 break;
6321 }
6322 /* cache callback name lookup
6323 * (if not done yet, i.e. it's the first error) */
6324 if (known_errorHandler==-1) {
6325 if ((errors==NULL) || (!strcmp(errors, "strict")))
6326 known_errorHandler = 1;
6327 else if (!strcmp(errors, "replace"))
6328 known_errorHandler = 2;
6329 else if (!strcmp(errors, "ignore"))
6330 known_errorHandler = 3;
6331 else if (!strcmp(errors, "xmlcharrefreplace"))
6332 known_errorHandler = 4;
6333 else
6334 known_errorHandler = 0;
6335 }
6336 switch (known_errorHandler) {
6337 case 1: /* strict */
6338 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6339 goto onError;
6340 case 2: /* replace */
6341 for (p = collstart; p < collend; ++p)
6342 *output++ = '?';
6343 /* fall through */
6344 case 3: /* ignore */
6345 p = collend;
6346 break;
6347 case 4: /* xmlcharrefreplace */
6348 /* generate replacement (temporarily (mis)uses p) */
6349 for (p = collstart; p < collend; ++p)
6350 output += sprintf(output, "&#%d;", (int)*p);
6351 p = collend;
6352 break;
6353 default:
6354 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6355 encoding, reason, s, length, &exc,
6356 collstart-s, collend-s, &newpos);
6357 if (repunicode == NULL)
6358 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006359 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006360 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006361 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6362 Py_DECREF(repunicode);
6363 goto onError;
6364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 /* generate replacement */
6366 repsize = PyUnicode_GET_SIZE(repunicode);
6367 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6368 Py_UNICODE ch = *uni2;
6369 if (Py_UNICODE_ISSPACE(ch))
6370 *output++ = ' ';
6371 else {
6372 decimal = Py_UNICODE_TODECIMAL(ch);
6373 if (decimal >= 0)
6374 *output++ = '0' + decimal;
6375 else if (0 < ch && ch < 256)
6376 *output++ = (char)ch;
6377 else {
6378 Py_DECREF(repunicode);
6379 raise_encode_exception(&exc, encoding,
6380 s, length, collstart-s, collend-s, reason);
6381 goto onError;
6382 }
6383 }
6384 }
6385 p = s + newpos;
6386 Py_DECREF(repunicode);
6387 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006388 }
6389 /* 0-terminate the output string */
6390 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391 Py_XDECREF(exc);
6392 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006393 return 0;
6394
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 Py_XDECREF(exc);
6397 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006398 return -1;
6399}
6400
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401/* --- Helpers ------------------------------------------------------------ */
6402
Eric Smith8c663262007-08-25 02:26:07 +00006403#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006404#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006405
Thomas Wouters477c8d52006-05-27 19:21:47 +00006406#include "stringlib/count.h"
6407#include "stringlib/find.h"
6408#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006409#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006410
Eric Smith5807c412008-05-11 21:00:57 +00006411#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006412#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006413#include "stringlib/localeutil.h"
6414
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len when too large; a negative end or start is taken
   relative to len and clamped to 0 if it is still negative.  start and
   end must be modifiable lvalues; every argument is evaluated more than
   once, so pass only side-effect free expressions.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. under an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006429
Martin v. Löwis18e16552006-02-15 17:27:45 +00006430Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006431 PyObject *substr,
6432 Py_ssize_t start,
6433 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006435 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006436 PyUnicodeObject* str_obj;
6437 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006438
Thomas Wouters477c8d52006-05-27 19:21:47 +00006439 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6440 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006442 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6443 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 Py_DECREF(str_obj);
6445 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 }
Tim Petersced69f82003-09-16 20:30:58 +00006447
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006448 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006449 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006450 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6451 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006452 );
6453
6454 Py_DECREF(sub_obj);
6455 Py_DECREF(str_obj);
6456
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 return result;
6458}
6459
Martin v. Löwis18e16552006-02-15 17:27:45 +00006460Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006461 PyObject *sub,
6462 Py_ssize_t start,
6463 Py_ssize_t end,
6464 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006466 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006467
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006469 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006471 sub = PyUnicode_FromObject(sub);
6472 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 Py_DECREF(str);
6474 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 }
Tim Petersced69f82003-09-16 20:30:58 +00006476
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 if (direction > 0)
6478 result = stringlib_find_slice(
6479 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6480 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6481 start, end
6482 );
6483 else
6484 result = stringlib_rfind_slice(
6485 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6486 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6487 start, end
6488 );
6489
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006491 Py_DECREF(sub);
6492
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 return result;
6494}
6495
Tim Petersced69f82003-09-16 20:30:58 +00006496static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 PyUnicodeObject *substring,
6499 Py_ssize_t start,
6500 Py_ssize_t end,
6501 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 if (substring->length == 0)
6504 return 1;
6505
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006506 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 end -= substring->length;
6508 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510
6511 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 if (Py_UNICODE_MATCH(self, end, substring))
6513 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 } else {
6515 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 }
6518
6519 return 0;
6520}
6521
Martin v. Löwis18e16552006-02-15 17:27:45 +00006522Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 PyObject *substr,
6524 Py_ssize_t start,
6525 Py_ssize_t end,
6526 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006528 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006529
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 str = PyUnicode_FromObject(str);
6531 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 substr = PyUnicode_FromObject(substr);
6534 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 Py_DECREF(str);
6536 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 }
Tim Petersced69f82003-09-16 20:30:58 +00006538
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 (PyUnicodeObject *)substr,
6541 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 Py_DECREF(str);
6543 Py_DECREF(substr);
6544 return result;
6545}
6546
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547/* Apply fixfct filter to the Unicode object self and return a
6548 reference to the modified object */
6549
Tim Petersced69f82003-09-16 20:30:58 +00006550static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553{
6554
6555 PyUnicodeObject *u;
6556
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006557 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006560
6561 Py_UNICODE_COPY(u->str, self->str, self->length);
6562
Tim Peters7a29bd52001-09-12 03:03:31 +00006563 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 /* fixfct should return TRUE if it modified the buffer. If
6565 FALSE, return a reference to the original buffer instead
6566 (to save space, not time) */
6567 Py_INCREF(self);
6568 Py_DECREF(u);
6569 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 }
6571 return (PyObject*) u;
6572}
6573
Tim Petersced69f82003-09-16 20:30:58 +00006574static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575int fixupper(PyUnicodeObject *self)
6576{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006577 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 Py_UNICODE *s = self->str;
6579 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006580
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006583
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 ch = Py_UNICODE_TOUPPER(*s);
6585 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 *s = ch;
6588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 s++;
6590 }
6591
6592 return status;
6593}
6594
Tim Petersced69f82003-09-16 20:30:58 +00006595static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596int fixlower(PyUnicodeObject *self)
6597{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006598 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 Py_UNICODE *s = self->str;
6600 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006601
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006604
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 ch = Py_UNICODE_TOLOWER(*s);
6606 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 *s = ch;
6609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 s++;
6611 }
6612
6613 return status;
6614}
6615
Tim Petersced69f82003-09-16 20:30:58 +00006616static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617int fixswapcase(PyUnicodeObject *self)
6618{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006619 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 Py_UNICODE *s = self->str;
6621 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006622
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 while (len-- > 0) {
6624 if (Py_UNICODE_ISUPPER(*s)) {
6625 *s = Py_UNICODE_TOLOWER(*s);
6626 status = 1;
6627 } else if (Py_UNICODE_ISLOWER(*s)) {
6628 *s = Py_UNICODE_TOUPPER(*s);
6629 status = 1;
6630 }
6631 s++;
6632 }
6633
6634 return status;
6635}
6636
Tim Petersced69f82003-09-16 20:30:58 +00006637static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638int fixcapitalize(PyUnicodeObject *self)
6639{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006640 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006641 Py_UNICODE *s = self->str;
6642 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006643
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006644 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006646 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 *s = Py_UNICODE_TOUPPER(*s);
6648 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006650 s++;
6651 while (--len > 0) {
6652 if (Py_UNICODE_ISUPPER(*s)) {
6653 *s = Py_UNICODE_TOLOWER(*s);
6654 status = 1;
6655 }
6656 s++;
6657 }
6658 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659}
6660
6661static
6662int fixtitle(PyUnicodeObject *self)
6663{
6664 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6665 register Py_UNICODE *e;
6666 int previous_is_cased;
6667
6668 /* Shortcut for single character strings */
6669 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6671 if (*p != ch) {
6672 *p = ch;
6673 return 1;
6674 }
6675 else
6676 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 }
Tim Petersced69f82003-09-16 20:30:58 +00006678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 e = p + PyUnicode_GET_SIZE(self);
6680 previous_is_cased = 0;
6681 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006683
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 if (previous_is_cased)
6685 *p = Py_UNICODE_TOLOWER(ch);
6686 else
6687 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006688
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 if (Py_UNICODE_ISLOWER(ch) ||
6690 Py_UNICODE_ISUPPER(ch) ||
6691 Py_UNICODE_ISTITLE(ch))
6692 previous_is_cased = 1;
6693 else
6694 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 }
6696 return 1;
6697}
6698
Tim Peters8ce9f162004-08-27 01:49:32 +00006699PyObject *
6700PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701{
Skip Montanaro6543b452004-09-16 03:28:13 +00006702 const Py_UNICODE blank = ' ';
6703 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006704 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006705 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006706 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6707 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006708 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6709 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006710 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006711 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712
Tim Peters05eba1f2004-08-27 21:32:02 +00006713 fseq = PySequence_Fast(seq, "");
6714 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006715 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006716 }
6717
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006718 /* NOTE: the following code can't call back into Python code,
6719 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006720 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006721
Tim Peters05eba1f2004-08-27 21:32:02 +00006722 seqlen = PySequence_Fast_GET_SIZE(fseq);
6723 /* If empty sequence, return u"". */
6724 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006725 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6726 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006727 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006728 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006729 /* If singleton sequence with an exact Unicode, return that. */
6730 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 item = items[0];
6732 if (PyUnicode_CheckExact(item)) {
6733 Py_INCREF(item);
6734 res = (PyUnicodeObject *)item;
6735 goto Done;
6736 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006737 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006738 else {
6739 /* Set up sep and seplen */
6740 if (separator == NULL) {
6741 sep = &blank;
6742 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006743 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006744 else {
6745 if (!PyUnicode_Check(separator)) {
6746 PyErr_Format(PyExc_TypeError,
6747 "separator: expected str instance,"
6748 " %.80s found",
6749 Py_TYPE(separator)->tp_name);
6750 goto onError;
6751 }
6752 sep = PyUnicode_AS_UNICODE(separator);
6753 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006754 }
6755 }
6756
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006757 /* There are at least two things to join, or else we have a subclass
6758 * of str in the sequence.
6759 * Do a pre-pass to figure out the total amount of space we'll
6760 * need (sz), and see whether all argument are strings.
6761 */
6762 sz = 0;
6763 for (i = 0; i < seqlen; i++) {
6764 const Py_ssize_t old_sz = sz;
6765 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 if (!PyUnicode_Check(item)) {
6767 PyErr_Format(PyExc_TypeError,
6768 "sequence item %zd: expected str instance,"
6769 " %.80s found",
6770 i, Py_TYPE(item)->tp_name);
6771 goto onError;
6772 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006773 sz += PyUnicode_GET_SIZE(item);
6774 if (i != 0)
6775 sz += seplen;
6776 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6777 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006779 goto onError;
6780 }
6781 }
Tim Petersced69f82003-09-16 20:30:58 +00006782
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006783 res = _PyUnicode_New(sz);
6784 if (res == NULL)
6785 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006786
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006787 /* Catenate everything. */
6788 res_p = PyUnicode_AS_UNICODE(res);
6789 for (i = 0; i < seqlen; ++i) {
6790 Py_ssize_t itemlen;
6791 item = items[i];
6792 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 /* Copy item, and maybe the separator. */
6794 if (i) {
6795 Py_UNICODE_COPY(res_p, sep, seplen);
6796 res_p += seplen;
6797 }
6798 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6799 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006800 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006801
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006803 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 return (PyObject *)res;
6805
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006807 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006808 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 return NULL;
6810}
6811
Tim Petersced69f82003-09-16 20:30:58 +00006812static
6813PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 Py_ssize_t left,
6815 Py_ssize_t right,
6816 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817{
6818 PyUnicodeObject *u;
6819
6820 if (left < 0)
6821 left = 0;
6822 if (right < 0)
6823 right = 0;
6824
Tim Peters7a29bd52001-09-12 03:03:31 +00006825 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 Py_INCREF(self);
6827 return self;
6828 }
6829
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006830 if (left > PY_SSIZE_T_MAX - self->length ||
6831 right > PY_SSIZE_T_MAX - (left + self->length)) {
6832 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6833 return NULL;
6834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 u = _PyUnicode_New(left + self->length + right);
6836 if (u) {
6837 if (left)
6838 Py_UNICODE_FILL(u->str, fill, left);
6839 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6840 if (right)
6841 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6842 }
6843
6844 return u;
6845}
6846
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006847PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850
6851 string = PyUnicode_FromObject(string);
6852 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006855 list = stringlib_splitlines(
6856 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6857 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858
6859 Py_DECREF(string);
6860 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861}
6862
Tim Petersced69f82003-09-16 20:30:58 +00006863static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 PyUnicodeObject *substring,
6866 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006869 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006872 return stringlib_split_whitespace(
6873 (PyObject*) self, self->str, self->length, maxcount
6874 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006876 return stringlib_split(
6877 (PyObject*) self, self->str, self->length,
6878 substring->str, substring->length,
6879 maxcount
6880 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881}
6882
Tim Petersced69f82003-09-16 20:30:58 +00006883static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006884PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 PyUnicodeObject *substring,
6886 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006887{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006888 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006889 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006890
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006891 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006892 return stringlib_rsplit_whitespace(
6893 (PyObject*) self, self->str, self->length, maxcount
6894 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006895
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006896 return stringlib_rsplit(
6897 (PyObject*) self, self->str, self->length,
6898 substring->str, substring->length,
6899 maxcount
6900 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006901}
6902
6903static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 PyUnicodeObject *str1,
6906 PyUnicodeObject *str2,
6907 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908{
6909 PyUnicodeObject *u;
6910
6911 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006913 else if (maxcount == 0 || self->length == 0)
6914 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915
Thomas Wouters477c8d52006-05-27 19:21:47 +00006916 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006917 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006918 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006919 if (str1->length == 0)
6920 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006921 if (str1->length == 1) {
6922 /* replace characters */
6923 Py_UNICODE u1, u2;
6924 if (!findchar(self->str, self->length, str1->str[0]))
6925 goto nothing;
6926 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6927 if (!u)
6928 return NULL;
6929 Py_UNICODE_COPY(u->str, self->str, self->length);
6930 u1 = str1->str[0];
6931 u2 = str2->str[0];
6932 for (i = 0; i < u->length; i++)
6933 if (u->str[i] == u1) {
6934 if (--maxcount < 0)
6935 break;
6936 u->str[i] = u2;
6937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006939 i = stringlib_find(
6940 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006942 if (i < 0)
6943 goto nothing;
6944 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6945 if (!u)
6946 return NULL;
6947 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006948
6949 /* change everything in-place, starting with this one */
6950 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6951 i += str1->length;
6952
6953 while ( --maxcount > 0) {
6954 i = stringlib_find(self->str+i, self->length-i,
6955 str1->str, str1->length,
6956 i);
6957 if (i == -1)
6958 break;
6959 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6960 i += str1->length;
6961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006964
6965 Py_ssize_t n, i, j, e;
6966 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967 Py_UNICODE *p;
6968
6969 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006970 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6971 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006972 if (n == 0)
6973 goto nothing;
6974 /* new_size = self->length + n * (str2->length - str1->length)); */
6975 delta = (str2->length - str1->length);
6976 if (delta == 0) {
6977 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006979 product = n * (str2->length - str1->length);
6980 if ((product / (str2->length - str1->length)) != n) {
6981 PyErr_SetString(PyExc_OverflowError,
6982 "replace string is too long");
6983 return NULL;
6984 }
6985 new_size = self->length + product;
6986 if (new_size < 0) {
6987 PyErr_SetString(PyExc_OverflowError,
6988 "replace string is too long");
6989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 }
6991 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006992 u = _PyUnicode_New(new_size);
6993 if (!u)
6994 return NULL;
6995 i = 0;
6996 p = u->str;
6997 e = self->length - str1->length;
6998 if (str1->length > 0) {
6999 while (n-- > 0) {
7000 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007001 j = stringlib_find(self->str+i, self->length-i,
7002 str1->str, str1->length,
7003 i);
7004 if (j == -1)
7005 break;
7006 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007007 /* copy unchanged part [i:j] */
7008 Py_UNICODE_COPY(p, self->str+i, j-i);
7009 p += j - i;
7010 }
7011 /* copy substitution string */
7012 if (str2->length > 0) {
7013 Py_UNICODE_COPY(p, str2->str, str2->length);
7014 p += str2->length;
7015 }
7016 i = j + str1->length;
7017 }
7018 if (i < self->length)
7019 /* copy tail [i:] */
7020 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7021 } else {
7022 /* interleave */
7023 while (n > 0) {
7024 Py_UNICODE_COPY(p, str2->str, str2->length);
7025 p += str2->length;
7026 if (--n <= 0)
7027 break;
7028 *p++ = self->str[i++];
7029 }
7030 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007034
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007036 /* nothing to replace; return original string (when possible) */
7037 if (PyUnicode_CheckExact(self)) {
7038 Py_INCREF(self);
7039 return (PyObject *) self;
7040 }
7041 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042}
7043
7044/* --- Unicode Object Methods --------------------------------------------- */
7045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007046PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048\n\
7049Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007050characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051
7052static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007053unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 return fixup(self, fixtitle);
7056}
7057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007058PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060\n\
7061Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007062have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
7064static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007065unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 return fixup(self, fixcapitalize);
7068}
7069
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self on whitespace, capitalizes every word in the resulting
   list, then joins the words with the default separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                fixcapitalize);
        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
7107
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007108/* Argument converter. Coerces to a single unicode character */
7109
7110static int
7111convert_uc(PyObject *obj, void *addr)
7112{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007113 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7114 PyObject *uniobj;
7115 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007116
Benjamin Peterson14339b62009-01-31 16:36:08 +00007117 uniobj = PyUnicode_FromObject(obj);
7118 if (uniobj == NULL) {
7119 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007121 return 0;
7122 }
7123 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7124 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007126 Py_DECREF(uniobj);
7127 return 0;
7128 }
7129 unistr = PyUnicode_AS_UNICODE(uniobj);
7130 *fillcharloc = unistr[0];
7131 Py_DECREF(uniobj);
7132 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007133}
7134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007135PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007138Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007139done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140
7141static PyObject *
7142unicode_center(PyUnicodeObject *self, PyObject *args)
7143{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007144 Py_ssize_t marg, left;
7145 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007146 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147
Thomas Woutersde017742006-02-16 19:34:37 +00007148 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 return NULL;
7150
Tim Peters7a29bd52001-09-12 03:03:31 +00007151 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 Py_INCREF(self);
7153 return (PyObject*) self;
7154 }
7155
7156 marg = width - self->length;
7157 left = marg / 2 + (marg & width & 1);
7158
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007159 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160}
7161
Marc-André Lemburge5034372000-08-08 08:04:29 +00007162#if 0
7163
7164/* This code should go into some future Unicode collation support
7165 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007166 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007167
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007168/* speedy UTF-16 code point order comparison */
7169/* gleaned from: */
7170/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7171
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007172static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007173{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007174 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007175 0, 0, 0, 0, 0, 0, 0, 0,
7176 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007177 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007178};
7179
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180static int
7181unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7182{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007183 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007184
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 Py_UNICODE *s1 = str1->str;
7186 Py_UNICODE *s2 = str2->str;
7187
7188 len1 = str1->length;
7189 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007190
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007192 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007193
7194 c1 = *s1++;
7195 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007196
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 if (c1 > (1<<11) * 26)
7198 c1 += utf16Fixup[c1>>11];
7199 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007200 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007201 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007202
7203 if (c1 != c2)
7204 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007205
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007206 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 }
7208
7209 return (len1 < len2) ? -1 : (len1 != len2);
7210}
7211
Marc-André Lemburge5034372000-08-08 08:04:29 +00007212#else
7213
7214static int
7215unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7216{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007217 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007218
7219 Py_UNICODE *s1 = str1->str;
7220 Py_UNICODE *s2 = str2->str;
7221
7222 len1 = str1->length;
7223 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007224
Marc-André Lemburge5034372000-08-08 08:04:29 +00007225 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007226 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007227
Fredrik Lundh45714e92001-06-26 16:39:36 +00007228 c1 = *s1++;
7229 c2 = *s2++;
7230
7231 if (c1 != c2)
7232 return (c1 < c2) ? -1 : 1;
7233
Marc-André Lemburge5034372000-08-08 08:04:29 +00007234 len1--; len2--;
7235 }
7236
7237 return (len1 < len2) ? -1 : (len1 != len2);
7238}
7239
7240#endif
7241
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007245 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7246 return unicode_compare((PyUnicodeObject *)left,
7247 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007248 PyErr_Format(PyExc_TypeError,
7249 "Can't compare %.100s and %.100s",
7250 left->ob_type->tp_name,
7251 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 return -1;
7253}
7254
Martin v. Löwis5b222132007-06-10 09:51:05 +00007255int
7256PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7257{
7258 int i;
7259 Py_UNICODE *id;
7260 assert(PyUnicode_Check(uni));
7261 id = PyUnicode_AS_UNICODE(uni);
7262 /* Compare Unicode string and source character set string */
7263 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 if (id[i] != str[i])
7265 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007266 /* This check keeps Python strings that end in '\0' from comparing equal
7267 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007268 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007270 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007272 return 0;
7273}
7274
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007275
Benjamin Peterson29060642009-01-31 22:14:21 +00007276#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007277 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007278
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007279PyObject *PyUnicode_RichCompare(PyObject *left,
7280 PyObject *right,
7281 int op)
7282{
7283 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007284
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007285 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7286 PyObject *v;
7287 if (((PyUnicodeObject *) left)->length !=
7288 ((PyUnicodeObject *) right)->length) {
7289 if (op == Py_EQ) {
7290 Py_INCREF(Py_False);
7291 return Py_False;
7292 }
7293 if (op == Py_NE) {
7294 Py_INCREF(Py_True);
7295 return Py_True;
7296 }
7297 }
7298 if (left == right)
7299 result = 0;
7300 else
7301 result = unicode_compare((PyUnicodeObject *)left,
7302 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007303
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007304 /* Convert the return value to a Boolean */
7305 switch (op) {
7306 case Py_EQ:
7307 v = TEST_COND(result == 0);
7308 break;
7309 case Py_NE:
7310 v = TEST_COND(result != 0);
7311 break;
7312 case Py_LE:
7313 v = TEST_COND(result <= 0);
7314 break;
7315 case Py_GE:
7316 v = TEST_COND(result >= 0);
7317 break;
7318 case Py_LT:
7319 v = TEST_COND(result == -1);
7320 break;
7321 case Py_GT:
7322 v = TEST_COND(result == 1);
7323 break;
7324 default:
7325 PyErr_BadArgument();
7326 return NULL;
7327 }
7328 Py_INCREF(v);
7329 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007331
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007332 Py_INCREF(Py_NotImplemented);
7333 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007334}
7335
Guido van Rossum403d68b2000-03-13 15:55:09 +00007336int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007338{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007339 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007340 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007341
7342 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007343 sub = PyUnicode_FromObject(element);
7344 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 PyErr_Format(PyExc_TypeError,
7346 "'in <string>' requires string as left operand, not %s",
7347 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007348 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007349 }
7350
Thomas Wouters477c8d52006-05-27 19:21:47 +00007351 str = PyUnicode_FromObject(container);
7352 if (!str) {
7353 Py_DECREF(sub);
7354 return -1;
7355 }
7356
7357 result = stringlib_contains_obj(str, sub);
7358
7359 Py_DECREF(str);
7360 Py_DECREF(sub);
7361
Guido van Rossum403d68b2000-03-13 15:55:09 +00007362 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007363}
7364
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365/* Concat to string or Unicode object giving a new Unicode object. */
7366
7367PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369{
7370 PyUnicodeObject *u = NULL, *v = NULL, *w;
7371
7372 /* Coerce the two arguments */
7373 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7374 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7377 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379
7380 /* Shortcuts */
7381 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 Py_DECREF(v);
7383 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 }
7385 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 Py_DECREF(u);
7387 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 }
7389
7390 /* Concat the two Unicode strings */
7391 w = _PyUnicode_New(u->length + v->length);
7392 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394 Py_UNICODE_COPY(w->str, u->str, u->length);
7395 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7396
7397 Py_DECREF(u);
7398 Py_DECREF(v);
7399 return (PyObject *)w;
7400
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 Py_XDECREF(u);
7403 Py_XDECREF(v);
7404 return NULL;
7405}
7406
Walter Dörwald1ab83302007-05-18 17:15:44 +00007407void
7408PyUnicode_Append(PyObject **pleft, PyObject *right)
7409{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007410 PyObject *new;
7411 if (*pleft == NULL)
7412 return;
7413 if (right == NULL || !PyUnicode_Check(*pleft)) {
7414 Py_DECREF(*pleft);
7415 *pleft = NULL;
7416 return;
7417 }
7418 new = PyUnicode_Concat(*pleft, right);
7419 Py_DECREF(*pleft);
7420 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007421}
7422
7423void
7424PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7425{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007426 PyUnicode_Append(pleft, right);
7427 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007428}
7429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007430PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007433Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007434string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007435interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436
7437static PyObject *
7438unicode_count(PyUnicodeObject *self, PyObject *args)
7439{
7440 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007441 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007442 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 PyObject *result;
7444
Guido van Rossumb8872e62000-05-09 14:14:27 +00007445 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007446 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 return NULL;
7448
7449 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007450 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007453
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007454 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007455 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007456 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007457 substring->str, substring->length,
7458 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007459 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007460
7461 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007462
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 return result;
7464}
7465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007466PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007467 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007469Encode S using the codec registered for encoding. Default encoding\n\
7470is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007471handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007472a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7473'xmlcharrefreplace' as well as any other name registered with\n\
7474codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475
7476static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007477unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007478{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007479 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 char *encoding = NULL;
7481 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007482
Benjamin Peterson308d6372009-09-18 21:42:35 +00007483 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7484 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007486 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007487}
7488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007489PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491\n\
7492Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007493If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007494
7495static PyObject*
7496unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7497{
7498 Py_UNICODE *e;
7499 Py_UNICODE *p;
7500 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007501 Py_UNICODE *qe;
7502 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503 PyUnicodeObject *u;
7504 int tabsize = 8;
7505
7506 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508
Thomas Wouters7e474022000-07-16 12:04:32 +00007509 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007510 i = 0; /* chars up to and including most recent \n or \r */
7511 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7512 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007513 for (p = self->str; p < e; p++)
7514 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 if (tabsize > 0) {
7516 incr = tabsize - (j % tabsize); /* cannot overflow */
7517 if (j > PY_SSIZE_T_MAX - incr)
7518 goto overflow1;
7519 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007520 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 if (j > PY_SSIZE_T_MAX - 1)
7524 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525 j++;
7526 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007527 if (i > PY_SSIZE_T_MAX - j)
7528 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007530 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 }
7532 }
7533
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007534 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007536
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537 /* Second pass: create output string and fill it */
7538 u = _PyUnicode_New(i + j);
7539 if (!u)
7540 return NULL;
7541
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007542 j = 0; /* same as in first pass */
7543 q = u->str; /* next output char */
7544 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545
7546 for (p = self->str; p < e; p++)
7547 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 if (tabsize > 0) {
7549 i = tabsize - (j % tabsize);
7550 j += i;
7551 while (i--) {
7552 if (q >= qe)
7553 goto overflow2;
7554 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007555 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007556 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007557 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 else {
7559 if (q >= qe)
7560 goto overflow2;
7561 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007562 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 if (*p == '\n' || *p == '\r')
7564 j = 0;
7565 }
7566
7567 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007568
7569 overflow2:
7570 Py_DECREF(u);
7571 overflow1:
7572 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7573 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574}
7575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007576PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578\n\
7579Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007580such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581arguments start and end are interpreted as in slice notation.\n\
7582\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007583Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584
7585static PyObject *
7586unicode_find(PyUnicodeObject *self, PyObject *args)
7587{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007588 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007589 Py_ssize_t start;
7590 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007591 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
Christian Heimes9cd17752007-11-18 19:35:23 +00007593 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595
Thomas Wouters477c8d52006-05-27 19:21:47 +00007596 result = stringlib_find_slice(
7597 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7598 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7599 start, end
7600 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601
7602 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007603
Christian Heimes217cfd12007-12-02 14:31:20 +00007604 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605}
7606
7607static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007608unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609{
7610 if (index < 0 || index >= self->length) {
7611 PyErr_SetString(PyExc_IndexError, "string index out of range");
7612 return NULL;
7613 }
7614
7615 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7616}
7617
Guido van Rossumc2504932007-09-18 19:42:40 +00007618/* Believe it or not, this produces the same value for ASCII strings
7619 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007620static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007621unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622{
Guido van Rossumc2504932007-09-18 19:42:40 +00007623 Py_ssize_t len;
7624 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007625 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007626
7627 if (self->hash != -1)
7628 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007629 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007630 p = self->str;
7631 x = *p << 7;
7632 while (--len >= 0)
7633 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007634 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007635 if (x == -1)
7636 x = -2;
7637 self->hash = x;
7638 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639}
7640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007641PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007644Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645
7646static PyObject *
7647unicode_index(PyUnicodeObject *self, PyObject *args)
7648{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007649 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007650 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007651 Py_ssize_t start;
7652 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653
Christian Heimes9cd17752007-11-18 19:35:23 +00007654 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656
Thomas Wouters477c8d52006-05-27 19:21:47 +00007657 result = stringlib_find_slice(
7658 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7659 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7660 start, end
7661 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662
7663 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007664
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665 if (result < 0) {
7666 PyErr_SetString(PyExc_ValueError, "substring not found");
7667 return NULL;
7668 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007669
Christian Heimes217cfd12007-12-02 14:31:20 +00007670 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671}
7672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007673PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007676Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007677at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678
7679static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007680unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007681{
7682 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7683 register const Py_UNICODE *e;
7684 int cased;
7685
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686 /* Shortcut for single character strings */
7687 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007690 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007691 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007693
Guido van Rossumd57fd912000-03-10 22:53:23 +00007694 e = p + PyUnicode_GET_SIZE(self);
7695 cased = 0;
7696 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007698
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7700 return PyBool_FromLong(0);
7701 else if (!cased && Py_UNICODE_ISLOWER(ch))
7702 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007704 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705}
7706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007707PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007710Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007711at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712
7713static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007714unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715{
7716 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7717 register const Py_UNICODE *e;
7718 int cased;
7719
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 /* Shortcut for single character strings */
7721 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007724 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007725 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007727
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 e = p + PyUnicode_GET_SIZE(self);
7729 cased = 0;
7730 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007732
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7734 return PyBool_FromLong(0);
7735 else if (!cased && Py_UNICODE_ISUPPER(ch))
7736 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007738 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739}
7740
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007741PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007744Return True if S is a titlecased string and there is at least one\n\
7745character in S, i.e. upper- and titlecase characters may only\n\
7746follow uncased characters and lowercase characters only cased ones.\n\
7747Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748
7749static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007750unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751{
7752 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7753 register const Py_UNICODE *e;
7754 int cased, previous_is_cased;
7755
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 /* Shortcut for single character strings */
7757 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007758 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7759 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007761 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007762 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007764
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765 e = p + PyUnicode_GET_SIZE(self);
7766 cased = 0;
7767 previous_is_cased = 0;
7768 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007770
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7772 if (previous_is_cased)
7773 return PyBool_FromLong(0);
7774 previous_is_cased = 1;
7775 cased = 1;
7776 }
7777 else if (Py_UNICODE_ISLOWER(ch)) {
7778 if (!previous_is_cased)
7779 return PyBool_FromLong(0);
7780 previous_is_cased = 1;
7781 cased = 1;
7782 }
7783 else
7784 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007786 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787}
7788
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007789PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007792Return True if all characters in S are whitespace\n\
7793and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794
7795static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007796unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797{
7798 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7799 register const Py_UNICODE *e;
7800
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 /* Shortcut for single character strings */
7802 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 Py_UNICODE_ISSPACE(*p))
7804 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007806 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007807 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007808 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007809
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 e = p + PyUnicode_GET_SIZE(self);
7811 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 if (!Py_UNICODE_ISSPACE(*p))
7813 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007814 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007815 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816}
7817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007818PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007820\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007821Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007822and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007823
7824static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007825unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007826{
7827 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7828 register const Py_UNICODE *e;
7829
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007830 /* Shortcut for single character strings */
7831 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 Py_UNICODE_ISALPHA(*p))
7833 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007834
7835 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007836 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007838
7839 e = p + PyUnicode_GET_SIZE(self);
7840 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 if (!Py_UNICODE_ISALPHA(*p))
7842 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007843 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007844 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007845}
7846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007847PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007849\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007850Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007851and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007852
7853static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007854unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007855{
7856 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7857 register const Py_UNICODE *e;
7858
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007859 /* Shortcut for single character strings */
7860 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 Py_UNICODE_ISALNUM(*p))
7862 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007863
7864 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007865 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007867
7868 e = p + PyUnicode_GET_SIZE(self);
7869 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 if (!Py_UNICODE_ISALNUM(*p))
7871 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007872 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007873 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007874}
7875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007876PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007879Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007880False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881
7882static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007883unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884{
7885 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7886 register const Py_UNICODE *e;
7887
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 /* Shortcut for single character strings */
7889 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007890 Py_UNICODE_ISDECIMAL(*p))
7891 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007893 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007894 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007896
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897 e = p + PyUnicode_GET_SIZE(self);
7898 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 if (!Py_UNICODE_ISDECIMAL(*p))
7900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007902 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903}
7904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007905PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007908Return True if all characters in S are digits\n\
7909and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910
7911static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007912unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913{
7914 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7915 register const Py_UNICODE *e;
7916
Guido van Rossumd57fd912000-03-10 22:53:23 +00007917 /* Shortcut for single character strings */
7918 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 Py_UNICODE_ISDIGIT(*p))
7920 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007921
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007922 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007923 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007924 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007925
Guido van Rossumd57fd912000-03-10 22:53:23 +00007926 e = p + PyUnicode_GET_SIZE(self);
7927 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 if (!Py_UNICODE_ISDIGIT(*p))
7929 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007930 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007931 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932}
7933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007934PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007937Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007938False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939
7940static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007941unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007942{
7943 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7944 register const Py_UNICODE *e;
7945
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946 /* Shortcut for single character strings */
7947 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 Py_UNICODE_ISNUMERIC(*p))
7949 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007951 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007952 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007954
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 e = p + PyUnicode_GET_SIZE(self);
7956 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 if (!Py_UNICODE_ISNUMERIC(*p))
7958 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007959 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007960 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961}
7962
Martin v. Löwis47383402007-08-15 07:32:56 +00007963int
7964PyUnicode_IsIdentifier(PyObject *self)
7965{
7966 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7967 register const Py_UNICODE *e;
7968
7969 /* Special case for empty strings */
7970 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007972
7973 /* PEP 3131 says that the first character must be in
7974 XID_Start and subsequent characters in XID_Continue,
7975 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007976 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007977 letters, digits, underscore). However, given the current
7978 definition of XID_Start and XID_Continue, it is sufficient
7979 to check just for these, except that _ must be allowed
7980 as starting an identifier. */
7981 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7982 return 0;
7983
7984 e = p + PyUnicode_GET_SIZE(self);
7985 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 if (!_PyUnicode_IsXidContinue(*p))
7987 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007988 }
7989 return 1;
7990}
7991
7992PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007994\n\
7995Return True if S is a valid identifier according\n\
7996to the language definition.");
7997
7998static PyObject*
7999unicode_isidentifier(PyObject *self)
8000{
8001 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8002}
8003
Georg Brandl559e5d72008-06-11 18:37:52 +00008004PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008006\n\
8007Return True if all characters in S are considered\n\
8008printable in repr() or S is empty, False otherwise.");
8009
8010static PyObject*
8011unicode_isprintable(PyObject *self)
8012{
8013 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8014 register const Py_UNICODE *e;
8015
8016 /* Shortcut for single character strings */
8017 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8018 Py_RETURN_TRUE;
8019 }
8020
8021 e = p + PyUnicode_GET_SIZE(self);
8022 for (; p < e; p++) {
8023 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8024 Py_RETURN_FALSE;
8025 }
8026 }
8027 Py_RETURN_TRUE;
8028}
8029
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008030PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008031 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032\n\
8033Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008034iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035
8036static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008037unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008039 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040}
8041
Martin v. Löwis18e16552006-02-15 17:27:45 +00008042static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043unicode_length(PyUnicodeObject *self)
8044{
8045 return self->length;
8046}
8047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008048PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008051Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008052done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053
8054static PyObject *
8055unicode_ljust(PyUnicodeObject *self, PyObject *args)
8056{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008057 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008058 Py_UNICODE fillchar = ' ';
8059
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008060 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 return NULL;
8062
Tim Peters7a29bd52001-09-12 03:03:31 +00008063 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 Py_INCREF(self);
8065 return (PyObject*) self;
8066 }
8067
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008068 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069}
8070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008071PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008074Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075
8076static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008077unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008078{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 return fixup(self, fixlower);
8080}
8081
/* Strip modes; the values double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i] + 3)
8090
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008091/* externally visible for str.strip(unicode) */
8092PyObject *
8093_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8094{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008095 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8096 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8097 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8098 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8099 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008100
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008102
Benjamin Peterson14339b62009-01-31 16:36:08 +00008103 i = 0;
8104 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8106 i++;
8107 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008108 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008109
Benjamin Peterson14339b62009-01-31 16:36:08 +00008110 j = len;
8111 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 do {
8113 j--;
8114 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8115 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008116 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008117
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 Py_INCREF(self);
8120 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008121 }
8122 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008124}
8125
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
8127static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008128do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8131 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008132
Benjamin Peterson14339b62009-01-31 16:36:08 +00008133 i = 0;
8134 if (striptype != RIGHTSTRIP) {
8135 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8136 i++;
8137 }
8138 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008139
Benjamin Peterson14339b62009-01-31 16:36:08 +00008140 j = len;
8141 if (striptype != LEFTSTRIP) {
8142 do {
8143 j--;
8144 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8145 j++;
8146 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008147
Benjamin Peterson14339b62009-01-31 16:36:08 +00008148 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8149 Py_INCREF(self);
8150 return (PyObject*)self;
8151 }
8152 else
8153 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154}
8155
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008156
8157static PyObject *
8158do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8159{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008161
Benjamin Peterson14339b62009-01-31 16:36:08 +00008162 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8163 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008164
Benjamin Peterson14339b62009-01-31 16:36:08 +00008165 if (sep != NULL && sep != Py_None) {
8166 if (PyUnicode_Check(sep))
8167 return _PyUnicode_XStrip(self, striptype, sep);
8168 else {
8169 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 "%s arg must be None or str",
8171 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 return NULL;
8173 }
8174 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008175
Benjamin Peterson14339b62009-01-31 16:36:08 +00008176 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008177}
8178
8179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008180PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008182\n\
8183Return a copy of the string S with leading and trailing\n\
8184whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008185If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008186
8187static PyObject *
8188unicode_strip(PyUnicodeObject *self, PyObject *args)
8189{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008190 if (PyTuple_GET_SIZE(args) == 0)
8191 return do_strip(self, BOTHSTRIP); /* Common case */
8192 else
8193 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008194}
8195
8196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008197PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008199\n\
8200Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008201If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008202
8203static PyObject *
8204unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8205{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008206 if (PyTuple_GET_SIZE(args) == 0)
8207 return do_strip(self, LEFTSTRIP); /* Common case */
8208 else
8209 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008210}
8211
8212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008213PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008215\n\
8216Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008217If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008218
8219static PyObject *
8220unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8221{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 if (PyTuple_GET_SIZE(args) == 0)
8223 return do_strip(self, RIGHTSTRIP); /* Common case */
8224 else
8225 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008226}
8227
8228
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008230unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231{
8232 PyUnicodeObject *u;
8233 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008234 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008235 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008236
Georg Brandl222de0f2009-04-12 12:01:50 +00008237 if (len < 1) {
8238 Py_INCREF(unicode_empty);
8239 return (PyObject *)unicode_empty;
8240 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241
Tim Peters7a29bd52001-09-12 03:03:31 +00008242 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243 /* no repeat, return original string */
8244 Py_INCREF(str);
8245 return (PyObject*) str;
8246 }
Tim Peters8f422462000-09-09 06:13:41 +00008247
8248 /* ensure # of chars needed doesn't overflow int and # of bytes
8249 * needed doesn't overflow size_t
8250 */
8251 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008252 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008253 PyErr_SetString(PyExc_OverflowError,
8254 "repeated string is too long");
8255 return NULL;
8256 }
8257 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8258 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8259 PyErr_SetString(PyExc_OverflowError,
8260 "repeated string is too long");
8261 return NULL;
8262 }
8263 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 if (!u)
8265 return NULL;
8266
8267 p = u->str;
8268
Georg Brandl222de0f2009-04-12 12:01:50 +00008269 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008270 Py_UNICODE_FILL(p, str->str[0], len);
8271 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008272 Py_ssize_t done = str->length; /* number of characters copied this far */
8273 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008275 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008276 Py_UNICODE_COPY(p+done, p, n);
8277 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 }
8280
8281 return (PyObject*) u;
8282}
8283
8284PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 PyObject *subobj,
8286 PyObject *replobj,
8287 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288{
8289 PyObject *self;
8290 PyObject *str1;
8291 PyObject *str2;
8292 PyObject *result;
8293
8294 self = PyUnicode_FromObject(obj);
8295 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297 str1 = PyUnicode_FromObject(subobj);
8298 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 Py_DECREF(self);
8300 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 }
8302 str2 = PyUnicode_FromObject(replobj);
8303 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 Py_DECREF(self);
8305 Py_DECREF(str1);
8306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 }
Tim Petersced69f82003-09-16 20:30:58 +00008308 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 (PyUnicodeObject *)str1,
8310 (PyUnicodeObject *)str2,
8311 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 Py_DECREF(self);
8313 Py_DECREF(str1);
8314 Py_DECREF(str2);
8315 return result;
8316}
8317
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008318PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008319 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320\n\
8321Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008322old replaced by new. If the optional argument count is\n\
8323given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324
8325static PyObject*
8326unicode_replace(PyUnicodeObject *self, PyObject *args)
8327{
8328 PyUnicodeObject *str1;
8329 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008330 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331 PyObject *result;
8332
Martin v. Löwis18e16552006-02-15 17:27:45 +00008333 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 return NULL;
8335 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8336 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008339 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 Py_DECREF(str1);
8341 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008342 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343
8344 result = replace(self, str1, str2, maxcount);
8345
8346 Py_DECREF(str1);
8347 Py_DECREF(str2);
8348 return result;
8349}
8350
8351static
8352PyObject *unicode_repr(PyObject *unicode)
8353{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008354 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008355 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008356 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8357 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8358
8359 /* XXX(nnorwitz): rather than over-allocating, it would be
8360 better to choose a different scheme. Perhaps scan the
8361 first N-chars of the string and allocate based on that size.
8362 */
8363 /* Initial allocation is based on the longest-possible unichr
8364 escape.
8365
8366 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8367 unichr, so in this case it's the longest unichr escape. In
8368 narrow (UTF-16) builds this is five chars per source unichr
8369 since there are two unichrs in the surrogate pair, so in narrow
8370 (UTF-16) builds it's not the longest unichr escape.
8371
8372 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8373 so in the narrow (UTF-16) build case it's the longest unichr
8374 escape.
8375 */
8376
Walter Dörwald1ab83302007-05-18 17:15:44 +00008377 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008379#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008381#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008383#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008385 if (repr == NULL)
8386 return NULL;
8387
Walter Dörwald1ab83302007-05-18 17:15:44 +00008388 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008389
8390 /* Add quote */
8391 *p++ = (findchar(s, size, '\'') &&
8392 !findchar(s, size, '"')) ? '"' : '\'';
8393 while (size-- > 0) {
8394 Py_UNICODE ch = *s++;
8395
8396 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008397 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008398 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008399 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008400 continue;
8401 }
8402
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008404 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008405 *p++ = '\\';
8406 *p++ = 't';
8407 }
8408 else if (ch == '\n') {
8409 *p++ = '\\';
8410 *p++ = 'n';
8411 }
8412 else if (ch == '\r') {
8413 *p++ = '\\';
8414 *p++ = 'r';
8415 }
8416
8417 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008418 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008419 *p++ = '\\';
8420 *p++ = 'x';
8421 *p++ = hexdigits[(ch >> 4) & 0x000F];
8422 *p++ = hexdigits[ch & 0x000F];
8423 }
8424
Georg Brandl559e5d72008-06-11 18:37:52 +00008425 /* Copy ASCII characters as-is */
8426 else if (ch < 0x7F) {
8427 *p++ = ch;
8428 }
8429
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008431 else {
8432 Py_UCS4 ucs = ch;
8433
8434#ifndef Py_UNICODE_WIDE
8435 Py_UNICODE ch2 = 0;
8436 /* Get code point from surrogate pair */
8437 if (size > 0) {
8438 ch2 = *s;
8439 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008444 size--;
8445 }
8446 }
8447#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008449 (categories Z* and C* except ASCII space)
8450 */
8451 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8452 /* Map 8-bit characters to '\xhh' */
8453 if (ucs <= 0xff) {
8454 *p++ = '\\';
8455 *p++ = 'x';
8456 *p++ = hexdigits[(ch >> 4) & 0x000F];
8457 *p++ = hexdigits[ch & 0x000F];
8458 }
8459 /* Map 21-bit characters to '\U00xxxxxx' */
8460 else if (ucs >= 0x10000) {
8461 *p++ = '\\';
8462 *p++ = 'U';
8463 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8464 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8465 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8466 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8467 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8468 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8469 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8470 *p++ = hexdigits[ucs & 0x0000000F];
8471 }
8472 /* Map 16-bit characters to '\uxxxx' */
8473 else {
8474 *p++ = '\\';
8475 *p++ = 'u';
8476 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8477 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8478 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8479 *p++ = hexdigits[ucs & 0x000F];
8480 }
8481 }
8482 /* Copy characters as-is */
8483 else {
8484 *p++ = ch;
8485#ifndef Py_UNICODE_WIDE
8486 if (ucs >= 0x10000)
8487 *p++ = ch2;
8488#endif
8489 }
8490 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008491 }
8492 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008493 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008494
8495 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008496 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008497 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498}
8499
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008500PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502\n\
8503Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008504such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505arguments start and end are interpreted as in slice notation.\n\
8506\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008507Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508
8509static PyObject *
8510unicode_rfind(PyUnicodeObject *self, PyObject *args)
8511{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008512 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008513 Py_ssize_t start;
8514 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008515 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516
Christian Heimes9cd17752007-11-18 19:35:23 +00008517 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008518 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519
Thomas Wouters477c8d52006-05-27 19:21:47 +00008520 result = stringlib_rfind_slice(
8521 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8522 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8523 start, end
8524 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525
8526 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008527
Christian Heimes217cfd12007-12-02 14:31:20 +00008528 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529}
8530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008531PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008534Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535
8536static PyObject *
8537unicode_rindex(PyUnicodeObject *self, PyObject *args)
8538{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008539 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008540 Py_ssize_t start;
8541 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008542 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543
Christian Heimes9cd17752007-11-18 19:35:23 +00008544 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546
Thomas Wouters477c8d52006-05-27 19:21:47 +00008547 result = stringlib_rfind_slice(
8548 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8549 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8550 start, end
8551 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552
8553 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008554
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555 if (result < 0) {
8556 PyErr_SetString(PyExc_ValueError, "substring not found");
8557 return NULL;
8558 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008559 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560}
8561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008562PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008565Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008566done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567
8568static PyObject *
8569unicode_rjust(PyUnicodeObject *self, PyObject *args)
8570{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008571 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008572 Py_UNICODE fillchar = ' ';
8573
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008574 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575 return NULL;
8576
Tim Peters7a29bd52001-09-12 03:03:31 +00008577 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 Py_INCREF(self);
8579 return (PyObject*) self;
8580 }
8581
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008582 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583}
8584
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 PyObject *sep,
8587 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588{
8589 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008590
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 s = PyUnicode_FromObject(s);
8592 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008593 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 if (sep != NULL) {
8595 sep = PyUnicode_FromObject(sep);
8596 if (sep == NULL) {
8597 Py_DECREF(s);
8598 return NULL;
8599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600 }
8601
8602 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8603
8604 Py_DECREF(s);
8605 Py_XDECREF(sep);
8606 return result;
8607}
8608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008609PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611\n\
8612Return a list of the words in S, using sep as the\n\
8613delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008614splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008615whitespace string is a separator and empty strings are\n\
8616removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
8618static PyObject*
8619unicode_split(PyUnicodeObject *self, PyObject *args)
8620{
8621 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008622 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623
Martin v. Löwis18e16552006-02-15 17:27:45 +00008624 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 return NULL;
8626
8627 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633}
8634
Thomas Wouters477c8d52006-05-27 19:21:47 +00008635PyObject *
8636PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8637{
8638 PyObject* str_obj;
8639 PyObject* sep_obj;
8640 PyObject* out;
8641
8642 str_obj = PyUnicode_FromObject(str_in);
8643 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008645 sep_obj = PyUnicode_FromObject(sep_in);
8646 if (!sep_obj) {
8647 Py_DECREF(str_obj);
8648 return NULL;
8649 }
8650
8651 out = stringlib_partition(
8652 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8653 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8654 );
8655
8656 Py_DECREF(sep_obj);
8657 Py_DECREF(str_obj);
8658
8659 return out;
8660}
8661
8662
8663PyObject *
8664PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8665{
8666 PyObject* str_obj;
8667 PyObject* sep_obj;
8668 PyObject* out;
8669
8670 str_obj = PyUnicode_FromObject(str_in);
8671 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008673 sep_obj = PyUnicode_FromObject(sep_in);
8674 if (!sep_obj) {
8675 Py_DECREF(str_obj);
8676 return NULL;
8677 }
8678
8679 out = stringlib_rpartition(
8680 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8681 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8682 );
8683
8684 Py_DECREF(sep_obj);
8685 Py_DECREF(str_obj);
8686
8687 return out;
8688}
8689
8690PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008692\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008693Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008694the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008695found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008696
8697static PyObject*
8698unicode_partition(PyUnicodeObject *self, PyObject *separator)
8699{
8700 return PyUnicode_Partition((PyObject *)self, separator);
8701}
8702
8703PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008704 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008705\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008706Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008707the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008708separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008709
8710static PyObject*
8711unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8712{
8713 return PyUnicode_RPartition((PyObject *)self, separator);
8714}
8715
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008716PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 PyObject *sep,
8718 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008719{
8720 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008721
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008722 s = PyUnicode_FromObject(s);
8723 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008724 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 if (sep != NULL) {
8726 sep = PyUnicode_FromObject(sep);
8727 if (sep == NULL) {
8728 Py_DECREF(s);
8729 return NULL;
8730 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008731 }
8732
8733 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8734
8735 Py_DECREF(s);
8736 Py_XDECREF(sep);
8737 return result;
8738}
8739
8740PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008742\n\
8743Return a list of the words in S, using sep as the\n\
8744delimiter string, starting at the end of the string and\n\
8745working to the front. If maxsplit is given, at most maxsplit\n\
8746splits are done. If sep is not specified, any whitespace string\n\
8747is a separator.");
8748
8749static PyObject*
8750unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8751{
8752 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008753 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008754
Martin v. Löwis18e16552006-02-15 17:27:45 +00008755 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008756 return NULL;
8757
8758 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008760 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008762 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008764}
8765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008766PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768\n\
8769Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008770Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008771is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772
8773static PyObject*
8774unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8775{
Guido van Rossum86662912000-04-11 15:38:46 +00008776 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777
Guido van Rossum86662912000-04-11 15:38:46 +00008778 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008779 return NULL;
8780
Guido van Rossum86662912000-04-11 15:38:46 +00008781 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782}
8783
8784static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008785PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008786{
Walter Dörwald346737f2007-05-31 10:44:43 +00008787 if (PyUnicode_CheckExact(self)) {
8788 Py_INCREF(self);
8789 return self;
8790 } else
8791 /* Subtype -- return genuine unicode string with the same value. */
8792 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8793 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008794}
8795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008796PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008798\n\
8799Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008800and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801
8802static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008803unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008805 return fixup(self, fixswapcase);
8806}
8807
Georg Brandlceee0772007-11-27 23:48:05 +00008808PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008810\n\
8811Return a translation table usable for str.translate().\n\
8812If there is only one argument, it must be a dictionary mapping Unicode\n\
8813ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008814Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008815If there are two arguments, they must be strings of equal length, and\n\
8816in the resulting dictionary, each character in x will be mapped to the\n\
8817character at the same position in y. If there is a third argument, it\n\
8818must be a string, whose characters will be mapped to None in the result.");
8819
8820static PyObject*
8821unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8822{
8823 PyObject *x, *y = NULL, *z = NULL;
8824 PyObject *new = NULL, *key, *value;
8825 Py_ssize_t i = 0;
8826 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008827
Georg Brandlceee0772007-11-27 23:48:05 +00008828 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8829 return NULL;
8830 new = PyDict_New();
8831 if (!new)
8832 return NULL;
8833 if (y != NULL) {
8834 /* x must be a string too, of equal length */
8835 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8836 if (!PyUnicode_Check(x)) {
8837 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8838 "be a string if there is a second argument");
8839 goto err;
8840 }
8841 if (PyUnicode_GET_SIZE(x) != ylen) {
8842 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8843 "arguments must have equal length");
8844 goto err;
8845 }
8846 /* create entries for translating chars in x to those in y */
8847 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008848 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8849 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008850 if (!key || !value)
8851 goto err;
8852 res = PyDict_SetItem(new, key, value);
8853 Py_DECREF(key);
8854 Py_DECREF(value);
8855 if (res < 0)
8856 goto err;
8857 }
8858 /* create entries for deleting chars in z */
8859 if (z != NULL) {
8860 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008861 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008862 if (!key)
8863 goto err;
8864 res = PyDict_SetItem(new, key, Py_None);
8865 Py_DECREF(key);
8866 if (res < 0)
8867 goto err;
8868 }
8869 }
8870 } else {
8871 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008872 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008873 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8874 "to maketrans it must be a dict");
8875 goto err;
8876 }
8877 /* copy entries into the new dict, converting string keys to int keys */
8878 while (PyDict_Next(x, &i, &key, &value)) {
8879 if (PyUnicode_Check(key)) {
8880 /* convert string keys to integer keys */
8881 PyObject *newkey;
8882 if (PyUnicode_GET_SIZE(key) != 1) {
8883 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8884 "table must be of length 1");
8885 goto err;
8886 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008887 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008888 if (!newkey)
8889 goto err;
8890 res = PyDict_SetItem(new, newkey, value);
8891 Py_DECREF(newkey);
8892 if (res < 0)
8893 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008894 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008895 /* just keep integer keys */
8896 if (PyDict_SetItem(new, key, value) < 0)
8897 goto err;
8898 } else {
8899 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8900 "be strings or integers");
8901 goto err;
8902 }
8903 }
8904 }
8905 return new;
8906 err:
8907 Py_DECREF(new);
8908 return NULL;
8909}
8910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008911PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913\n\
8914Return a copy of the string S, where all characters have been mapped\n\
8915through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008916Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008917Unmapped characters are left untouched. Characters mapped to None\n\
8918are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919
8920static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008921unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008922{
Georg Brandlceee0772007-11-27 23:48:05 +00008923 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924}
8925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008926PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008929Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008930
8931static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008932unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 return fixup(self, fixupper);
8935}
8936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008937PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008940Pad a numeric string S with zeros on the left, to fill a field\n\
8941of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942
8943static PyObject *
8944unicode_zfill(PyUnicodeObject *self, PyObject *args)
8945{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008946 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008947 PyUnicodeObject *u;
8948
Martin v. Löwis18e16552006-02-15 17:27:45 +00008949 Py_ssize_t width;
8950 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008951 return NULL;
8952
8953 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008954 if (PyUnicode_CheckExact(self)) {
8955 Py_INCREF(self);
8956 return (PyObject*) self;
8957 }
8958 else
8959 return PyUnicode_FromUnicode(
8960 PyUnicode_AS_UNICODE(self),
8961 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 }
8964
8965 fill = width - self->length;
8966
8967 u = pad(self, fill, 0, '0');
8968
Walter Dörwald068325e2002-04-15 13:36:47 +00008969 if (u == NULL)
8970 return NULL;
8971
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 if (u->str[fill] == '+' || u->str[fill] == '-') {
8973 /* move sign to beginning of string */
8974 u->str[0] = u->str[fill];
8975 u->str[fill] = '0';
8976 }
8977
8978 return (PyObject*) u;
8979}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980
#if 0
/* Compiled out: helpers that are only referenced from the (equally
   disabled) debugging entries of the method table. */

/* Return the current value of numfree as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}

/* Pass the string's characters to PyUnicode_TransformDecimalToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
8995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008996PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008999Return True if S starts with the specified prefix, False otherwise.\n\
9000With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009001With optional end, stop comparing S at that position.\n\
9002prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003
9004static PyObject *
9005unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009008 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009010 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009011 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009012 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009014 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9016 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009017 if (PyTuple_Check(subobj)) {
9018 Py_ssize_t i;
9019 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9020 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009022 if (substring == NULL)
9023 return NULL;
9024 result = tailmatch(self, substring, start, end, -1);
9025 Py_DECREF(substring);
9026 if (result) {
9027 Py_RETURN_TRUE;
9028 }
9029 }
9030 /* nothing matched */
9031 Py_RETURN_FALSE;
9032 }
9033 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009036 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009038 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039}
9040
9041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009042PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009043 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009045Return True if S ends with the specified suffix, False otherwise.\n\
9046With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009047With optional end, stop comparing S at that position.\n\
9048suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049
9050static PyObject *
9051unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009052 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009054 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009056 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009057 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009058 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009060 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9062 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009063 if (PyTuple_Check(subobj)) {
9064 Py_ssize_t i;
9065 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9066 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009067 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009068 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009069 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009070 result = tailmatch(self, substring, start, end, +1);
9071 Py_DECREF(substring);
9072 if (result) {
9073 Py_RETURN_TRUE;
9074 }
9075 }
9076 Py_RETURN_FALSE;
9077 }
9078 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009082 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009084 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085}
9086
Eric Smith8c663262007-08-25 02:26:07 +00009087#include "stringlib/string_format.h"
9088
9089PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009091\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009092Return a formatted version of S, using substitutions from args and kwargs.\n\
9093The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009094
Eric Smith27bbca62010-11-04 17:06:58 +00009095PyDoc_STRVAR(format_map__doc__,
9096 "S.format_map(mapping) -> str\n\
9097\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009098Return a formatted version of S, using substitutions from mapping.\n\
9099The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009100
Eric Smith4a7d76d2008-05-30 18:10:19 +00009101static PyObject *
9102unicode__format__(PyObject* self, PyObject* args)
9103{
9104 PyObject *format_spec;
9105
9106 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9107 return NULL;
9108
9109 return _PyUnicode_FormatAdvanced(self,
9110 PyUnicode_AS_UNICODE(format_spec),
9111 PyUnicode_GET_SIZE(format_spec));
9112}
9113
Eric Smith8c663262007-08-25 02:26:07 +00009114PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009116\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009117Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009118
9119static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009120unicode__sizeof__(PyUnicodeObject *v)
9121{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009122 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9123 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009124}
9125
9126PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009127 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009128
9129static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009130unicode_getnewargs(PyUnicodeObject *v)
9131{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009132 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009133}
9134
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135static PyMethodDef unicode_methods[] = {
9136
9137 /* Order is according to common usage: often used methods should
9138 appear first, since lookup is done sequentially. */
9139
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009140 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009141 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9142 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009143 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009144 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9145 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9146 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9147 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9148 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9149 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9150 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009151 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009152 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9153 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9154 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009155 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009156 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9157 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9158 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009159 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009160 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009161 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009162 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009163 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9164 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9165 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9166 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9167 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9168 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9169 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9170 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9171 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9172 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9173 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9174 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9175 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9176 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009177 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009178 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009179 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009180 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009181 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009182 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009183 {"maketrans", (PyCFunction) unicode_maketrans,
9184 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009185 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009186#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009187 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188#endif
9189
9190#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009191 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009192 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009193 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194#endif
9195
Benjamin Peterson14339b62009-01-31 16:36:08 +00009196 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009197 {NULL, NULL}
9198};
9199
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009200static PyObject *
9201unicode_mod(PyObject *v, PyObject *w)
9202{
Benjamin Peterson29060642009-01-31 22:14:21 +00009203 if (!PyUnicode_Check(v)) {
9204 Py_INCREF(Py_NotImplemented);
9205 return Py_NotImplemented;
9206 }
9207 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009208}
9209
9210static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009211 0, /*nb_add*/
9212 0, /*nb_subtract*/
9213 0, /*nb_multiply*/
9214 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009215};
9216
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009218 (lenfunc) unicode_length, /* sq_length */
9219 PyUnicode_Concat, /* sq_concat */
9220 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9221 (ssizeargfunc) unicode_getitem, /* sq_item */
9222 0, /* sq_slice */
9223 0, /* sq_ass_item */
9224 0, /* sq_ass_slice */
9225 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226};
9227
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009228static PyObject*
9229unicode_subscript(PyUnicodeObject* self, PyObject* item)
9230{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009231 if (PyIndex_Check(item)) {
9232 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009233 if (i == -1 && PyErr_Occurred())
9234 return NULL;
9235 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009236 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009237 return unicode_getitem(self, i);
9238 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009239 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009240 Py_UNICODE* source_buf;
9241 Py_UNICODE* result_buf;
9242 PyObject* result;
9243
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009244 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009246 return NULL;
9247 }
9248
9249 if (slicelength <= 0) {
9250 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009251 } else if (start == 0 && step == 1 && slicelength == self->length &&
9252 PyUnicode_CheckExact(self)) {
9253 Py_INCREF(self);
9254 return (PyObject *)self;
9255 } else if (step == 1) {
9256 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009257 } else {
9258 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009259 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9260 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009261
Benjamin Peterson29060642009-01-31 22:14:21 +00009262 if (result_buf == NULL)
9263 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009264
9265 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9266 result_buf[i] = source_buf[cur];
9267 }
Tim Petersced69f82003-09-16 20:30:58 +00009268
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009269 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009270 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009271 return result;
9272 }
9273 } else {
9274 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9275 return NULL;
9276 }
9277}
9278
9279static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009280 (lenfunc)unicode_length, /* mp_length */
9281 (binaryfunc)unicode_subscript, /* mp_subscript */
9282 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009283};
9284
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286/* Helpers for PyUnicode_Format() */
9287
9288static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009289getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009291 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009293 (*p_argidx)++;
9294 if (arglen < 0)
9295 return args;
9296 else
9297 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298 }
9299 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009300 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009301 return NULL;
9302}
9303
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009304/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009306static PyObject *
9307formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009308{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009309 char *p;
9310 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009312
Guido van Rossumd57fd912000-03-10 22:53:23 +00009313 x = PyFloat_AsDouble(v);
9314 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009315 return NULL;
9316
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009318 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009319
Eric Smith0923d1d2009-04-16 20:16:10 +00009320 p = PyOS_double_to_string(x, type, prec,
9321 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009322 if (p == NULL)
9323 return NULL;
9324 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009325 PyMem_Free(p);
9326 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327}
9328
Tim Peters38fd5b62000-09-21 05:43:11 +00009329static PyObject*
9330formatlong(PyObject *val, int flags, int prec, int type)
9331{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009332 char *buf;
9333 int len;
9334 PyObject *str; /* temporary string object. */
9335 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009336
Benjamin Peterson14339b62009-01-31 16:36:08 +00009337 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9338 if (!str)
9339 return NULL;
9340 result = PyUnicode_FromStringAndSize(buf, len);
9341 Py_DECREF(str);
9342 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009343}
9344
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345static int
9346formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009347 size_t buflen,
9348 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009350 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009351 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009352 if (PyUnicode_GET_SIZE(v) == 1) {
9353 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9354 buf[1] = '\0';
9355 return 1;
9356 }
9357#ifndef Py_UNICODE_WIDE
9358 if (PyUnicode_GET_SIZE(v) == 2) {
9359 /* Decode a valid surrogate pair */
9360 int c0 = PyUnicode_AS_UNICODE(v)[0];
9361 int c1 = PyUnicode_AS_UNICODE(v)[1];
9362 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9363 0xDC00 <= c1 && c1 <= 0xDFFF) {
9364 buf[0] = c0;
9365 buf[1] = c1;
9366 buf[2] = '\0';
9367 return 2;
9368 }
9369 }
9370#endif
9371 goto onError;
9372 }
9373 else {
9374 /* Integer input truncated to a character */
9375 long x;
9376 x = PyLong_AsLong(v);
9377 if (x == -1 && PyErr_Occurred())
9378 goto onError;
9379
9380 if (x < 0 || x > 0x10ffff) {
9381 PyErr_SetString(PyExc_OverflowError,
9382 "%c arg not in range(0x110000)");
9383 return -1;
9384 }
9385
9386#ifndef Py_UNICODE_WIDE
9387 if (x > 0xffff) {
9388 x -= 0x10000;
9389 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9390 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9391 return 2;
9392 }
9393#endif
9394 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009395 buf[1] = '\0';
9396 return 1;
9397 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009398
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009400 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009401 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009402 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403}
9404
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in characters, of the scratch buffer in
   which single characters are formatted.  The expansion is fully
   parenthesized so the macro can be used in any expression context,
   e.g. as the operand of sizeof.
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009409
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009411 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412{
9413 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009414 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415 int args_owned = 0;
9416 PyUnicodeObject *result = NULL;
9417 PyObject *dict = NULL;
9418 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009419
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009421 PyErr_BadInternalCall();
9422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423 }
9424 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009425 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427 fmt = PyUnicode_AS_UNICODE(uformat);
9428 fmtcnt = PyUnicode_GET_SIZE(uformat);
9429
9430 reslen = rescnt = fmtcnt + 100;
9431 result = _PyUnicode_New(reslen);
9432 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434 res = PyUnicode_AS_UNICODE(result);
9435
9436 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009437 arglen = PyTuple_Size(args);
9438 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439 }
9440 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009441 arglen = -1;
9442 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009444 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009445 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009446 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447
9448 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009449 if (*fmt != '%') {
9450 if (--rescnt < 0) {
9451 rescnt = fmtcnt + 100;
9452 reslen += rescnt;
9453 if (_PyUnicode_Resize(&result, reslen) < 0)
9454 goto onError;
9455 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9456 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009457 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009458 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009459 }
9460 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009461 /* Got a format specifier */
9462 int flags = 0;
9463 Py_ssize_t width = -1;
9464 int prec = -1;
9465 Py_UNICODE c = '\0';
9466 Py_UNICODE fill;
9467 int isnumok;
9468 PyObject *v = NULL;
9469 PyObject *temp = NULL;
9470 Py_UNICODE *pbuf;
9471 Py_UNICODE sign;
9472 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009473 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009474
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 fmt++;
9476 if (*fmt == '(') {
9477 Py_UNICODE *keystart;
9478 Py_ssize_t keylen;
9479 PyObject *key;
9480 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009481
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 if (dict == NULL) {
9483 PyErr_SetString(PyExc_TypeError,
9484 "format requires a mapping");
9485 goto onError;
9486 }
9487 ++fmt;
9488 --fmtcnt;
9489 keystart = fmt;
9490 /* Skip over balanced parentheses */
9491 while (pcount > 0 && --fmtcnt >= 0) {
9492 if (*fmt == ')')
9493 --pcount;
9494 else if (*fmt == '(')
9495 ++pcount;
9496 fmt++;
9497 }
9498 keylen = fmt - keystart - 1;
9499 if (fmtcnt < 0 || pcount > 0) {
9500 PyErr_SetString(PyExc_ValueError,
9501 "incomplete format key");
9502 goto onError;
9503 }
9504#if 0
9505 /* keys are converted to strings using UTF-8 and
9506 then looked up since Python uses strings to hold
9507 variables names etc. in its namespaces and we
9508 wouldn't want to break common idioms. */
9509 key = PyUnicode_EncodeUTF8(keystart,
9510 keylen,
9511 NULL);
9512#else
9513 key = PyUnicode_FromUnicode(keystart, keylen);
9514#endif
9515 if (key == NULL)
9516 goto onError;
9517 if (args_owned) {
9518 Py_DECREF(args);
9519 args_owned = 0;
9520 }
9521 args = PyObject_GetItem(dict, key);
9522 Py_DECREF(key);
9523 if (args == NULL) {
9524 goto onError;
9525 }
9526 args_owned = 1;
9527 arglen = -1;
9528 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009529 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009530 while (--fmtcnt >= 0) {
9531 switch (c = *fmt++) {
9532 case '-': flags |= F_LJUST; continue;
9533 case '+': flags |= F_SIGN; continue;
9534 case ' ': flags |= F_BLANK; continue;
9535 case '#': flags |= F_ALT; continue;
9536 case '0': flags |= F_ZERO; continue;
9537 }
9538 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009539 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 if (c == '*') {
9541 v = getnextarg(args, arglen, &argidx);
9542 if (v == NULL)
9543 goto onError;
9544 if (!PyLong_Check(v)) {
9545 PyErr_SetString(PyExc_TypeError,
9546 "* wants int");
9547 goto onError;
9548 }
9549 width = PyLong_AsLong(v);
9550 if (width == -1 && PyErr_Occurred())
9551 goto onError;
9552 if (width < 0) {
9553 flags |= F_LJUST;
9554 width = -width;
9555 }
9556 if (--fmtcnt >= 0)
9557 c = *fmt++;
9558 }
9559 else if (c >= '0' && c <= '9') {
9560 width = c - '0';
9561 while (--fmtcnt >= 0) {
9562 c = *fmt++;
9563 if (c < '0' || c > '9')
9564 break;
9565 if ((width*10) / 10 != width) {
9566 PyErr_SetString(PyExc_ValueError,
9567 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009568 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 }
9570 width = width*10 + (c - '0');
9571 }
9572 }
9573 if (c == '.') {
9574 prec = 0;
9575 if (--fmtcnt >= 0)
9576 c = *fmt++;
9577 if (c == '*') {
9578 v = getnextarg(args, arglen, &argidx);
9579 if (v == NULL)
9580 goto onError;
9581 if (!PyLong_Check(v)) {
9582 PyErr_SetString(PyExc_TypeError,
9583 "* wants int");
9584 goto onError;
9585 }
9586 prec = PyLong_AsLong(v);
9587 if (prec == -1 && PyErr_Occurred())
9588 goto onError;
9589 if (prec < 0)
9590 prec = 0;
9591 if (--fmtcnt >= 0)
9592 c = *fmt++;
9593 }
9594 else if (c >= '0' && c <= '9') {
9595 prec = c - '0';
9596 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009597 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009598 if (c < '0' || c > '9')
9599 break;
9600 if ((prec*10) / 10 != prec) {
9601 PyErr_SetString(PyExc_ValueError,
9602 "prec too big");
9603 goto onError;
9604 }
9605 prec = prec*10 + (c - '0');
9606 }
9607 }
9608 } /* prec */
9609 if (fmtcnt >= 0) {
9610 if (c == 'h' || c == 'l' || c == 'L') {
9611 if (--fmtcnt >= 0)
9612 c = *fmt++;
9613 }
9614 }
9615 if (fmtcnt < 0) {
9616 PyErr_SetString(PyExc_ValueError,
9617 "incomplete format");
9618 goto onError;
9619 }
9620 if (c != '%') {
9621 v = getnextarg(args, arglen, &argidx);
9622 if (v == NULL)
9623 goto onError;
9624 }
9625 sign = 0;
9626 fill = ' ';
9627 switch (c) {
9628
9629 case '%':
9630 pbuf = formatbuf;
9631 /* presume that buffer length is at least 1 */
9632 pbuf[0] = '%';
9633 len = 1;
9634 break;
9635
9636 case 's':
9637 case 'r':
9638 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009639 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009640 temp = v;
9641 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009642 }
9643 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009644 if (c == 's')
9645 temp = PyObject_Str(v);
9646 else if (c == 'r')
9647 temp = PyObject_Repr(v);
9648 else
9649 temp = PyObject_ASCII(v);
9650 if (temp == NULL)
9651 goto onError;
9652 if (PyUnicode_Check(temp))
9653 /* nothing to do */;
9654 else {
9655 Py_DECREF(temp);
9656 PyErr_SetString(PyExc_TypeError,
9657 "%s argument has non-string str()");
9658 goto onError;
9659 }
9660 }
9661 pbuf = PyUnicode_AS_UNICODE(temp);
9662 len = PyUnicode_GET_SIZE(temp);
9663 if (prec >= 0 && len > prec)
9664 len = prec;
9665 break;
9666
9667 case 'i':
9668 case 'd':
9669 case 'u':
9670 case 'o':
9671 case 'x':
9672 case 'X':
9673 if (c == 'i')
9674 c = 'd';
9675 isnumok = 0;
9676 if (PyNumber_Check(v)) {
9677 PyObject *iobj=NULL;
9678
9679 if (PyLong_Check(v)) {
9680 iobj = v;
9681 Py_INCREF(iobj);
9682 }
9683 else {
9684 iobj = PyNumber_Long(v);
9685 }
9686 if (iobj!=NULL) {
9687 if (PyLong_Check(iobj)) {
9688 isnumok = 1;
9689 temp = formatlong(iobj, flags, prec, c);
9690 Py_DECREF(iobj);
9691 if (!temp)
9692 goto onError;
9693 pbuf = PyUnicode_AS_UNICODE(temp);
9694 len = PyUnicode_GET_SIZE(temp);
9695 sign = 1;
9696 }
9697 else {
9698 Py_DECREF(iobj);
9699 }
9700 }
9701 }
9702 if (!isnumok) {
9703 PyErr_Format(PyExc_TypeError,
9704 "%%%c format: a number is required, "
9705 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9706 goto onError;
9707 }
9708 if (flags & F_ZERO)
9709 fill = '0';
9710 break;
9711
9712 case 'e':
9713 case 'E':
9714 case 'f':
9715 case 'F':
9716 case 'g':
9717 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009718 temp = formatfloat(v, flags, prec, c);
9719 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009720 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009721 pbuf = PyUnicode_AS_UNICODE(temp);
9722 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 sign = 1;
9724 if (flags & F_ZERO)
9725 fill = '0';
9726 break;
9727
9728 case 'c':
9729 pbuf = formatbuf;
9730 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9731 if (len < 0)
9732 goto onError;
9733 break;
9734
9735 default:
9736 PyErr_Format(PyExc_ValueError,
9737 "unsupported format character '%c' (0x%x) "
9738 "at index %zd",
9739 (31<=c && c<=126) ? (char)c : '?',
9740 (int)c,
9741 (Py_ssize_t)(fmt - 1 -
9742 PyUnicode_AS_UNICODE(uformat)));
9743 goto onError;
9744 }
9745 if (sign) {
9746 if (*pbuf == '-' || *pbuf == '+') {
9747 sign = *pbuf++;
9748 len--;
9749 }
9750 else if (flags & F_SIGN)
9751 sign = '+';
9752 else if (flags & F_BLANK)
9753 sign = ' ';
9754 else
9755 sign = 0;
9756 }
9757 if (width < len)
9758 width = len;
9759 if (rescnt - (sign != 0) < width) {
9760 reslen -= rescnt;
9761 rescnt = width + fmtcnt + 100;
9762 reslen += rescnt;
9763 if (reslen < 0) {
9764 Py_XDECREF(temp);
9765 PyErr_NoMemory();
9766 goto onError;
9767 }
9768 if (_PyUnicode_Resize(&result, reslen) < 0) {
9769 Py_XDECREF(temp);
9770 goto onError;
9771 }
9772 res = PyUnicode_AS_UNICODE(result)
9773 + reslen - rescnt;
9774 }
9775 if (sign) {
9776 if (fill != ' ')
9777 *res++ = sign;
9778 rescnt--;
9779 if (width > len)
9780 width--;
9781 }
9782 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9783 assert(pbuf[0] == '0');
9784 assert(pbuf[1] == c);
9785 if (fill != ' ') {
9786 *res++ = *pbuf++;
9787 *res++ = *pbuf++;
9788 }
9789 rescnt -= 2;
9790 width -= 2;
9791 if (width < 0)
9792 width = 0;
9793 len -= 2;
9794 }
9795 if (width > len && !(flags & F_LJUST)) {
9796 do {
9797 --rescnt;
9798 *res++ = fill;
9799 } while (--width > len);
9800 }
9801 if (fill == ' ') {
9802 if (sign)
9803 *res++ = sign;
9804 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9805 assert(pbuf[0] == '0');
9806 assert(pbuf[1] == c);
9807 *res++ = *pbuf++;
9808 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009809 }
9810 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009811 Py_UNICODE_COPY(res, pbuf, len);
9812 res += len;
9813 rescnt -= len;
9814 while (--width >= len) {
9815 --rescnt;
9816 *res++ = ' ';
9817 }
9818 if (dict && (argidx < arglen) && c != '%') {
9819 PyErr_SetString(PyExc_TypeError,
9820 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009821 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009822 goto onError;
9823 }
9824 Py_XDECREF(temp);
9825 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826 } /* until end */
9827 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009828 PyErr_SetString(PyExc_TypeError,
9829 "not all arguments converted during string formatting");
9830 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831 }
9832
Thomas Woutersa96affe2006-03-12 00:29:36 +00009833 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009834 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009836 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837 }
9838 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009839 return (PyObject *)result;
9840
Benjamin Peterson29060642009-01-31 22:14:21 +00009841 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842 Py_XDECREF(result);
9843 Py_DECREF(uformat);
9844 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009845 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846 }
9847 return NULL;
9848}
9849
Jeremy Hylton938ace62002-07-17 16:30:39 +00009850static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009851unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9852
Tim Peters6d6c1a32001-08-02 04:15:00 +00009853static PyObject *
9854unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9855{
Benjamin Peterson29060642009-01-31 22:14:21 +00009856 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009857 static char *kwlist[] = {"object", "encoding", "errors", 0};
9858 char *encoding = NULL;
9859 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009860
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 if (type != &PyUnicode_Type)
9862 return unicode_subtype_new(type, args, kwds);
9863 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009864 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009865 return NULL;
9866 if (x == NULL)
9867 return (PyObject *)_PyUnicode_New(0);
9868 if (encoding == NULL && errors == NULL)
9869 return PyObject_Str(x);
9870 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009872}
9873
Guido van Rossume023fe02001-08-30 03:12:59 +00009874static PyObject *
9875unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9876{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009877 PyUnicodeObject *tmp, *pnew;
9878 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009879
Benjamin Peterson14339b62009-01-31 16:36:08 +00009880 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9881 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9882 if (tmp == NULL)
9883 return NULL;
9884 assert(PyUnicode_Check(tmp));
9885 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9886 if (pnew == NULL) {
9887 Py_DECREF(tmp);
9888 return NULL;
9889 }
9890 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9891 if (pnew->str == NULL) {
9892 _Py_ForgetReference((PyObject *)pnew);
9893 PyObject_Del(pnew);
9894 Py_DECREF(tmp);
9895 return PyErr_NoMemory();
9896 }
9897 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9898 pnew->length = n;
9899 pnew->hash = tmp->hash;
9900 Py_DECREF(tmp);
9901 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009902}
9903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009904PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009905 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009906\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009907Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009908encoding defaults to the current default string encoding.\n\
9909errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009910
/* Forward declaration: unicode_iter is the tp_iter slot below and is
   defined after the iterator type. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  Slots set to 0 are left unset here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                      /* tp_name */
    sizeof(PyUnicodeObject),    /* tp_basicsize */
    0,                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                          /* tp_print */
    0,                          /* tp_getattr */
    0,                          /* tp_setattr */
    0,                          /* tp_reserved */
    unicode_repr,               /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,    /* tp_hash*/
    0,                          /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,    /* tp_getattro */
    0,                          /* tp_setattro */
    0,                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                /* tp_doc */
    0,                          /* tp_traverse */
    0,                          /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                          /* tp_weaklistoffset */
    unicode_iter,               /* tp_iter */
    0,                          /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                          /* tp_members */
    0,                          /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                          /* tp_dict */
    0,                          /* tp_descr_get */
    0,                          /* tp_descr_set */
    0,                          /* tp_dictoffset */
    0,                          /* tp_init */
    0,                          /* tp_alloc */
    unicode_new,                /* tp_new */
    PyObject_Del,               /* tp_free */
};
9956
9957/* Initialize the Unicode implementation */
9958
Thomas Wouters78890102000-07-22 19:25:51 +00009959void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009960{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009961 int i;
9962
Thomas Wouters477c8d52006-05-27 19:21:47 +00009963 /* XXX - move this array to unicodectype.c ? */
9964 Py_UNICODE linebreak[] = {
9965 0x000A, /* LINE FEED */
9966 0x000D, /* CARRIAGE RETURN */
9967 0x001C, /* FILE SEPARATOR */
9968 0x001D, /* GROUP SEPARATOR */
9969 0x001E, /* RECORD SEPARATOR */
9970 0x0085, /* NEXT LINE */
9971 0x2028, /* LINE SEPARATOR */
9972 0x2029, /* PARAGRAPH SEPARATOR */
9973 };
9974
Fred Drakee4315f52000-05-09 19:53:39 +00009975 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009976 free_list = NULL;
9977 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009979 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009980 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009981
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009982 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009984 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009985 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009986
9987 /* initialize the linebreak bloom filter */
9988 bloom_linebreak = make_bloom_mask(
9989 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9990 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009991
9992 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993}
9994
9995/* Finalize the Unicode implementation */
9996
Christian Heimesa156e092008-02-16 07:38:31 +00009997int
9998PyUnicode_ClearFreeList(void)
9999{
10000 int freelist_size = numfree;
10001 PyUnicodeObject *u;
10002
10003 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010004 PyUnicodeObject *v = u;
10005 u = *(PyUnicodeObject **)u;
10006 if (v->str)
10007 PyObject_DEL(v->str);
10008 Py_XDECREF(v->defenc);
10009 PyObject_Del(v);
10010 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010011 }
10012 free_list = NULL;
10013 assert(numfree == 0);
10014 return freelist_size;
10015}
10016
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017void
Thomas Wouters78890102000-07-22 19:25:51 +000010018_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010020 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010022 Py_XDECREF(unicode_empty);
10023 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010024
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010025 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 if (unicode_latin1[i]) {
10027 Py_DECREF(unicode_latin1[i]);
10028 unicode_latin1[i] = NULL;
10029 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010030 }
Christian Heimesa156e092008-02-16 07:38:31 +000010031 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010033
/* Intern the str referenced by *p.

   If an equal string is already in the interned dict, *p is replaced by
   a new reference to that string and the reference previously held in
   *p is released.  Otherwise the string itself is added to the dict and
   marked SSTATE_INTERNED_MORTAL.

   Nothing happens for instances of str subclasses, for strings that are
   already interned, or when the dict cannot be created or updated (the
   exception is cleared in those cases).  Passing NULL or a non-str
   object is a fatal error. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* Already interned: hand out the canonical object instead. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The insertion runs with the thread's recursion_critical flag set;
       the flag is cleared again on both the failure and success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
10081
10082void
10083PyUnicode_InternImmortal(PyObject **p)
10084{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010085 PyUnicode_InternInPlace(p);
10086 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10087 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10088 Py_INCREF(*p);
10089 }
Walter Dörwald16807132007-05-25 13:52:07 +000010090}
10091
10092PyObject *
10093PyUnicode_InternFromString(const char *cp)
10094{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010095 PyObject *s = PyUnicode_FromString(cp);
10096 if (s == NULL)
10097 return NULL;
10098 PyUnicode_InternInPlace(&s);
10099 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010100}
10101
10102void _Py_ReleaseInternedUnicodeStrings(void)
10103{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010104 PyObject *keys;
10105 PyUnicodeObject *s;
10106 Py_ssize_t i, n;
10107 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010108
Benjamin Peterson14339b62009-01-31 16:36:08 +000010109 if (interned == NULL || !PyDict_Check(interned))
10110 return;
10111 keys = PyDict_Keys(interned);
10112 if (keys == NULL || !PyList_Check(keys)) {
10113 PyErr_Clear();
10114 return;
10115 }
Walter Dörwald16807132007-05-25 13:52:07 +000010116
Benjamin Peterson14339b62009-01-31 16:36:08 +000010117 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10118 detector, interned unicode strings are not forcibly deallocated;
10119 rather, we give them their stolen references back, and then clear
10120 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010121
Benjamin Peterson14339b62009-01-31 16:36:08 +000010122 n = PyList_GET_SIZE(keys);
10123 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010124 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010125 for (i = 0; i < n; i++) {
10126 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10127 switch (s->state) {
10128 case SSTATE_NOT_INTERNED:
10129 /* XXX Shouldn't happen */
10130 break;
10131 case SSTATE_INTERNED_IMMORTAL:
10132 Py_REFCNT(s) += 1;
10133 immortal_size += s->length;
10134 break;
10135 case SSTATE_INTERNED_MORTAL:
10136 Py_REFCNT(s) += 2;
10137 mortal_size += s->length;
10138 break;
10139 default:
10140 Py_FatalError("Inconsistent interned string state.");
10141 }
10142 s->state = SSTATE_NOT_INTERNED;
10143 }
10144 fprintf(stderr, "total size of all interned strings: "
10145 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10146 "mortal/immortal\n", mortal_size, immortal_size);
10147 Py_DECREF(keys);
10148 PyDict_Clear(interned);
10149 Py_DECREF(interned);
10150 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010151}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010152
10153
10154/********************* Unicode Iterator **************************/
10155
/* State of an iterator over the characters of a str. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
10161
/* Destructor of the str iterator: untrack it from the garbage
   collector, release the string it holds (if any) and free the
   iterator object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
10169
/* tp_traverse of the str iterator: report the string being iterated
   (the only object reference the iterator holds) to the visitor. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
10176
10177static PyObject *
10178unicodeiter_next(unicodeiterobject *it)
10179{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010180 PyUnicodeObject *seq;
10181 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010182
Benjamin Peterson14339b62009-01-31 16:36:08 +000010183 assert(it != NULL);
10184 seq = it->it_seq;
10185 if (seq == NULL)
10186 return NULL;
10187 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010188
Benjamin Peterson14339b62009-01-31 16:36:08 +000010189 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10190 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010191 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010192 if (item != NULL)
10193 ++it->it_index;
10194 return item;
10195 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010196
Benjamin Peterson14339b62009-01-31 16:36:08 +000010197 Py_DECREF(seq);
10198 it->it_seq = NULL;
10199 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010200}
10201
10202static PyObject *
10203unicodeiter_len(unicodeiterobject *it)
10204{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010205 Py_ssize_t len = 0;
10206 if (it->it_seq)
10207 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10208 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010209}
10210
/* Docstring of the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
10218
/* Type object of the str iterator.  Instances take part in garbage
   collection (Py_TPFLAGS_HAVE_GC).  Slots set to 0 are left unset here. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,              /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,              /* tp_print */
    0,              /* tp_getattr */
    0,              /* tp_setattr */
    0,              /* tp_reserved */
    0,              /* tp_repr */
    0,              /* tp_as_number */
    0,              /* tp_as_sequence */
    0,              /* tp_as_mapping */
    0,              /* tp_hash */
    0,              /* tp_call */
    0,              /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,              /* tp_setattro */
    0,              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,              /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,              /* tp_clear */
    0,              /* tp_richcompare */
    0,              /* tp_weaklistoffset */
    PyObject_SelfIter,      /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,              /* tp_members */
};
10251
10252static PyObject *
10253unicode_iter(PyObject *seq)
10254{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010255 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010256
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 if (!PyUnicode_Check(seq)) {
10258 PyErr_BadInternalCall();
10259 return NULL;
10260 }
10261 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10262 if (it == NULL)
10263 return NULL;
10264 it->it_index = 0;
10265 Py_INCREF(seq);
10266 it->it_seq = (PyUnicodeObject *)seq;
10267 _PyObject_GC_TRACK(it);
10268 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010269}
10270
Martin v. Löwis5b222132007-06-10 09:51:05 +000010271size_t
10272Py_UNICODE_strlen(const Py_UNICODE *u)
10273{
10274 int res = 0;
10275 while(*u++)
10276 res++;
10277 return res;
10278}
10279
10280Py_UNICODE*
10281Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10282{
10283 Py_UNICODE *u = s1;
10284 while ((*u++ = *s2++));
10285 return s1;
10286}
10287
10288Py_UNICODE*
10289Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10290{
10291 Py_UNICODE *u = s1;
10292 while ((*u++ = *s2++))
10293 if (n-- == 0)
10294 break;
10295 return s1;
10296}
10297
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010298Py_UNICODE*
10299Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10300{
10301 Py_UNICODE *u1 = s1;
10302 u1 += Py_UNICODE_strlen(u1);
10303 Py_UNICODE_strcpy(u1, s2);
10304 return s1;
10305}
10306
Martin v. Löwis5b222132007-06-10 09:51:05 +000010307int
10308Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10309{
10310 while (*s1 && *s2 && *s1 == *s2)
10311 s1++, s2++;
10312 if (*s1 && *s2)
10313 return (*s1 < *s2) ? -1 : +1;
10314 if (*s1)
10315 return 1;
10316 if (*s2)
10317 return -1;
10318 return 0;
10319}
10320
Victor Stinneref8d95c2010-08-16 22:03:11 +000010321int
10322Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10323{
10324 register Py_UNICODE u1, u2;
10325 for (; n != 0; n--) {
10326 u1 = *s1;
10327 u2 = *s2;
10328 if (u1 != u2)
10329 return (u1 < u2) ? -1 : +1;
10330 if (u1 == '\0')
10331 return 0;
10332 s1++;
10333 s2++;
10334 }
10335 return 0;
10336}
10337
Martin v. Löwis5b222132007-06-10 09:51:05 +000010338Py_UNICODE*
10339Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10340{
10341 const Py_UNICODE *p;
10342 for (p = s; *p; p++)
10343 if (*p == c)
10344 return (Py_UNICODE*)p;
10345 return NULL;
10346}
10347
Victor Stinner331ea922010-08-10 16:37:20 +000010348Py_UNICODE*
10349Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10350{
10351 const Py_UNICODE *p;
10352 p = s + Py_UNICODE_strlen(s);
10353 while (p != s) {
10354 p--;
10355 if (*p == c)
10356 return (Py_UNICODE*)p;
10357 }
10358 return NULL;
10359}
10360
Victor Stinner71133ff2010-09-01 23:43:53 +000010361Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010362PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010363{
10364 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10365 Py_UNICODE *copy;
10366 Py_ssize_t size;
10367
10368 /* Ensure we won't overflow the size. */
10369 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10370 PyErr_NoMemory();
10371 return NULL;
10372 }
10373 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10374 size *= sizeof(Py_UNICODE);
10375 copy = PyMem_Malloc(size);
10376 if (copy == NULL) {
10377 PyErr_NoMemory();
10378 return NULL;
10379 }
10380 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10381 return copy;
10382}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010383
Georg Brandl66c221e2010-10-14 07:04:07 +000010384/* A _string module, to export formatter_parser and formatter_field_name_split
10385 to the string.Formatter class implemented in Python. */
10386
10387static PyMethodDef _string_methods[] = {
10388 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10389 METH_O, PyDoc_STR("split the argument as a field name")},
10390 {"formatter_parser", (PyCFunction) formatter_parser,
10391 METH_O, PyDoc_STR("parse the argument as a format string")},
10392 {NULL, NULL}
10393};
10394
10395static struct PyModuleDef _string_module = {
10396 PyModuleDef_HEAD_INIT,
10397 "_string",
10398 PyDoc_STR("string helper module"),
10399 0,
10400 _string_methods,
10401 NULL,
10402 NULL,
10403 NULL,
10404 NULL
10405};
10406
10407PyMODINIT_FUNC
10408PyInit__string(void)
10409{
10410 return PyModule_Create(&_string_module);
10411}
10412
10413
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010414#ifdef __cplusplus
10415}
10416#endif