blob: 57baebd54710186e79dd3bc8cf498497af27c991 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Limit for the Unicode object free list: unicode_dealloc() only parks an
   object on the free list while fewer than this many are already there. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.
   (The comparison in unicode_dealloc() is made against the object's
   length, i.e. a count of Py_UNICODE elements, not a byte count.)

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN
   ends up defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* Free list for Unicode objects.  The list is singly linked: the first
   pointer-sized word of each parked object is reused as the "next" link
   (see _PyUnicode_New() and unicode_dealloc()).  numfree counts the
   objects currently on the list. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are created lazily and stay NULL until the
   corresponding character is first requested. */
static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by code point 0..127; an entry is 1 when the
   character counts as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
144
/* Forward declarations of two file-local (static) encoder error helpers,
   so they can be referenced ahead of their definitions.
   NOTE(review): the definitions are not visible in this part of the
   file -- confirm parameter semantics and ownership rules there. */
static PyObject *unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void raise_encode_exception(PyObject **exceptionObject,
                                   const char *encoding,
                                   const Py_UNICODE *unicode, Py_ssize_t size,
                                   Py_ssize_t startpos, Py_ssize_t endpos,
                                   const char *reason);
155
/* Same for linebreaks: lookup table indexed by code point 0..127, an
   entry is 1 when the character is a line break and 0 otherwise.
   The table is read-only, so it is declared const (like
   _Py_ascii_whitespace) and cannot be modified by accident. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
183
184
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000185Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000186PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000188#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000189 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 /* This is actually an illegal character, so it should
192 not be passed to unichr. */
193 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#endif
195}
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197/* --- Bloom Filters ----------------------------------------------------- */
198
199/* stuff to implement simple "bloom filters" for Unicode characters.
200 to keep things simple, we use a single bitmask, using the least 5
201 bits from each unicode characters as the bit index. */
202
203/* the linebreak mask is set up by Unicode_Init below */
204
Antoine Pitrouf068f942010-01-13 14:19:12 +0000205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215#define BLOOM_MASK unsigned long
216
217static BLOOM_MASK bloom_linebreak;
218
Antoine Pitrouf068f942010-01-13 14:19:12 +0000219#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
220#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221
Benjamin Peterson29060642009-01-31 22:14:21 +0000222#define BLOOM_LINEBREAK(ch) \
223 ((ch) < 128U ? ascii_linebreak[(ch)] : \
224 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228 /* calculate simple bloom-style bitmask for a given unicode string */
229
Antoine Pitrouf068f942010-01-13 14:19:12 +0000230 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 Py_ssize_t i;
232
233 mask = 0;
234 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000235 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236
237 return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242 Py_ssize_t i;
243
244 for (i = 0; i < setlen; i++)
245 if (set[i] == chr)
246 return 1;
247
248 return 0;
249}
250
/* Membership test with bloom pre-filter: the exact (linear) search in
   unicode_member() only runs when the bloom mask says `chr` may be in
   `set`.  The whole expansion and every argument are parenthesized so the
   macro can be negated or combined with other operators safely.  Note
   that `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
253
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254/* --- Unicode Object ----------------------------------------------------- */
255
256static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259{
260 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000261
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000266 /* Resizing shared object (unicode_empty or single character
267 objects) in-place is not allowed. Use PyUnicode_Resize()
268 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 (unicode->length == 1 &&
272 unicode->str[0] < 256U &&
273 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000275 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 return -1;
277 }
278
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279 /* We allocate one more byte to make sure the string is Ux0000 terminated.
280 The overallocation is also used by fastsearch, which assumes that it's
281 safe to look at str[length] (without making any assumptions about what
282 it contains). */
283
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000285 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000286 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 PyErr_NoMemory();
290 return -1;
291 }
292 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294
Benjamin Peterson29060642009-01-31 22:14:21 +0000295 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000298 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
305/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000306 Ux0000 terminated; some code (e.g. new_identifier)
307 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308
309 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000310 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000417 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 }
419 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000420 *(PyUnicodeObject **)unicode = free_list;
421 free_list = unicode;
422 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000425 PyObject_DEL(unicode->str);
426 Py_XDECREF(unicode->defenc);
427 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 }
429}
430
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433{
434 register PyUnicodeObject *v;
435
436 /* Argument checks */
437 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000441 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 PyErr_BadInternalCall();
444 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 }
446
447 /* Resizing unicode_empty and single character objects is not
448 possible since these are being shared. We simply return a fresh
449 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000450 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 (v == unicode_empty || v->length == 1)) {
452 PyUnicodeObject *w = _PyUnicode_New(length);
453 if (w == NULL)
454 return -1;
455 Py_UNICODE_COPY(w->str, v->str,
456 length < v->length ? length : v->length);
457 Py_DECREF(*unicode);
458 *unicode = w;
459 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
461
462 /* Note that we don't have to modify *unicode for unshared Unicode
463 objects, since we can modify them in-place. */
464 return unicode_resize(v, length);
465}
466
/* Public wrapper around _PyUnicode_Resize(): casts the PyObject ** to
   PyUnicodeObject ** and returns the helper's result unchanged
   (0 on success, -1 on failure). */
int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474{
475 PyUnicodeObject *unicode;
476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 /* If the Unicode data is known at construction time, we can apply
478 some optimizations which share commonly used objects. */
479 if (u != NULL) {
480
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000486
487 /* Single character Unicode objects in the Latin-1 range are
488 shared when using this constructor */
489 if (size == 1 && *u < 256) {
490 unicode = unicode_latin1[*u];
491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = *u;
496 unicode_latin1[*u] = unicode;
497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000501 }
Tim Petersced69f82003-09-16 20:30:58 +0000502
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 /* Copy the Unicode data into the new object */
508 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000509 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 return (PyObject *)unicode;
512}
513
Walter Dörwaldd2034312007-05-18 16:29:38 +0000514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000515{
516 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000517
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 return NULL;
522 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000525 some optimizations which share commonly used objects.
526 Also, this means the input must be UTF-8, so fall back to the
527 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (u != NULL) {
529
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 /* Optimization for empty strings */
531 if (size == 0 && unicode_empty != NULL) {
532 Py_INCREF(unicode_empty);
533 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000534 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000535
536 /* Single characters are shared when using this constructor.
537 Restrict to ASCII, since the input must be UTF-8. */
538 if (size == 1 && Py_CHARMASK(*u) < 128) {
539 unicode = unicode_latin1[Py_CHARMASK(*u)];
540 if (!unicode) {
541 unicode = _PyUnicode_New(1);
542 if (!unicode)
543 return NULL;
544 unicode->str[0] = Py_CHARMASK(*u);
545 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 }
547 Py_INCREF(unicode);
548 return (PyObject *)unicode;
549 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000550
551 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000552 }
553
Walter Dörwald55507312007-05-18 13:12:10 +0000554 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 if (!unicode)
556 return NULL;
557
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 return (PyObject *)unicode;
559}
560
Walter Dörwaldd2034312007-05-18 16:29:38 +0000561PyObject *PyUnicode_FromString(const char *u)
562{
563 size_t size = strlen(u);
564 if (size > PY_SSIZE_T_MAX) {
565 PyErr_SetString(PyExc_OverflowError, "input too long");
566 return NULL;
567 }
568
569 return PyUnicode_FromStringAndSize(u, size);
570}
571
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572#ifdef HAVE_WCHAR_H
573
Mark Dickinson081dfee2009-03-18 14:47:41 +0000574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581 to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584 Py_ssize_t size)
585{
586 PyUnicodeObject *unicode;
587 register Py_ssize_t i;
588 Py_ssize_t alloc;
589 const wchar_t *orig_w;
590
591 if (w == NULL) {
592 if (size == 0)
593 return PyUnicode_FromStringAndSize(NULL, 0);
594 PyErr_BadInternalCall();
595 return NULL;
596 }
597
598 if (size == -1) {
599 size = wcslen(w);
600 }
601
602 alloc = size;
603 orig_w = w;
604 for (i = size; i > 0; i--) {
605 if (*w > 0xFFFF)
606 alloc++;
607 w++;
608 }
609 w = orig_w;
610 unicode = _PyUnicode_New(alloc);
611 if (!unicode)
612 return NULL;
613
614 /* Copy the wchar_t data into the new object */
615 {
616 register Py_UNICODE *u;
617 u = PyUnicode_AS_UNICODE(unicode);
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF) {
620 wchar_t ordinal = *w++;
621 ordinal -= 0x10000;
622 *u++ = 0xD800 | (ordinal >> 10);
623 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 }
625 else
626 *u++ = *w++;
627 }
628 }
629 return (PyObject *)unicode;
630}
631
632#else
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636{
637 PyUnicodeObject *unicode;
638
639 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000640 if (size == 0)
641 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == -1) {
647 size = wcslen(w);
648 }
649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = _PyUnicode_New(size);
651 if (!unicode)
652 return NULL;
653
654 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000657#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000659 register Py_UNICODE *u;
660 register Py_ssize_t i;
661 u = PyUnicode_AS_UNICODE(unicode);
662 for (i = size; i > 0; i--)
663 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665#endif
666
667 return (PyObject *)unicode;
668}
669
Mark Dickinson081dfee2009-03-18 14:47:41 +0000670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
Walter Dörwald346737f2007-05-31 10:44:43 +0000674static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000688 else if (longlongflag) {
689 /* longlongflag should only ever be nonzero on machines with
690 HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692 char *f = PY_FORMAT_LONG_LONG;
693 while (*f)
694 *fmt++ = *f++;
695#else
696 /* we shouldn't ever get here */
697 assert(0);
698 *fmt++ = 'l';
699#endif
700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000701 else if (size_tflag) {
702 char *f = PY_FORMAT_SIZE_T;
703 while (*f)
704 *fmt++ = *f++;
705 }
706 *fmt++ = c;
707 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000708}
709
/* Copy the NUL-terminated char string `string` to the output position.
   Expands in place and relies on two variables of the expansion site:
   `copy` (a const char * cursor) and `s` (the output pointer, which is
   left just past the last character written). */
#define appendstring(string) \
    {for (copy = string;*copy;) *s++ = *copy++;}

/* Length of the fixed-size scratch buffer used to format one argument. */
#define ITEM_BUFFER_LEN 21

/* Upper bound on the characters produced by %ld: 21 covers a 64-bit
   integer printed in decimal together with an optional sign. */
#define MAX_LONG_CHARS 21

/* Upper bound on the characters produced by %lld: at most
   ceil(log10(256)*SIZEOF_LONG_LONG) digits plus one for the sign.
   53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 va_list count;
726 Py_ssize_t callcount = 0;
727 PyObject **callresults = NULL;
728 PyObject **callresult = NULL;
729 Py_ssize_t n = 0;
730 int width = 0;
731 int precision = 0;
732 int zeropad;
733 const char* f;
734 Py_UNICODE *s;
735 PyObject *string;
736 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000737 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 /* use abuffer instead of buffer, if we need more space
739 * (which can happen if there's a format specifier with width). */
740 char *abuffer = NULL;
741 char *realbuffer;
742 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000746 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000747 /* step 1: count the number of %S/%R/%A/%s format specifications
748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000752 if (*f == '%') {
753 if (*(f+1)=='%')
754 continue;
755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
756 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000757 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000758 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 ;
761 if (*f == 's')
762 ++callcount;
763 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000764 else if (128 <= (unsigned char)*f) {
765 PyErr_Format(PyExc_ValueError,
766 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000767 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000768 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000769 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 }
772 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 if (callcount) {
775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776 if (!callresults) {
777 PyErr_NoMemory();
778 return NULL;
779 }
780 callresult = callresults;
781 }
782 /* step 3: figure out how large a buffer we need */
783 for (f = format; *f; f++) {
784 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000785#ifdef HAVE_LONG_LONG
786 int longlongflag = 0;
787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000788 const char* p = f;
789 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000790 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796 * they don't affect the amount of space we reserve.
797 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000798 if (*f == 'l') {
799 if (f[1] == 'd' || f[1] == 'u') {
800 ++f;
801 }
802#ifdef HAVE_LONG_LONG
803 else if (f[1] == 'l' &&
804 (f[2] == 'd' || f[2] == 'u')) {
805 longlongflag = 1;
806 f += 2;
807 }
808#endif
809 }
810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000811 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000812 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
Benjamin Peterson14339b62009-01-31 16:36:08 +0000814 switch (*f) {
815 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000816 {
817#ifndef Py_UNICODE_WIDE
818 int ordinal = va_arg(count, int);
819 if (ordinal > 0xffff)
820 n += 2;
821 else
822 n++;
823#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000824 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000825 n++;
826#endif
827 break;
828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 case '%':
830 n++;
831 break;
832 case 'd': case 'u': case 'i': case 'x':
833 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000834#ifdef HAVE_LONG_LONG
835 if (longlongflag) {
836 if (width < MAX_LONG_LONG_CHARS)
837 width = MAX_LONG_LONG_CHARS;
838 }
839 else
840#endif
841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842 including sign. Decimal takes the most space. This
843 isn't enough for octal. If a width is specified we
844 need more (which we allocate later). */
845 if (width < MAX_LONG_CHARS)
846 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000848 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000849 if (abuffersize < width)
850 abuffersize = width;
851 break;
852 case 's':
853 {
854 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000855 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857 if (!str)
858 goto fail;
859 n += PyUnicode_GET_SIZE(str);
860 /* Remember the str and switch to the next slot */
861 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000862 break;
863 }
864 case 'U':
865 {
866 PyObject *obj = va_arg(count, PyObject *);
867 assert(obj && PyUnicode_Check(obj));
868 n += PyUnicode_GET_SIZE(obj);
869 break;
870 }
871 case 'V':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 const char *str = va_arg(count, const char *);
875 assert(obj || str);
876 assert(!obj || PyUnicode_Check(obj));
877 if (obj)
878 n += PyUnicode_GET_SIZE(obj);
879 else
880 n += strlen(str);
881 break;
882 }
883 case 'S':
884 {
885 PyObject *obj = va_arg(count, PyObject *);
886 PyObject *str;
887 assert(obj);
888 str = PyObject_Str(obj);
889 if (!str)
890 goto fail;
891 n += PyUnicode_GET_SIZE(str);
892 /* Remember the str and switch to the next slot */
893 *callresult++ = str;
894 break;
895 }
896 case 'R':
897 {
898 PyObject *obj = va_arg(count, PyObject *);
899 PyObject *repr;
900 assert(obj);
901 repr = PyObject_Repr(obj);
902 if (!repr)
903 goto fail;
904 n += PyUnicode_GET_SIZE(repr);
905 /* Remember the repr and switch to the next slot */
906 *callresult++ = repr;
907 break;
908 }
909 case 'A':
910 {
911 PyObject *obj = va_arg(count, PyObject *);
912 PyObject *ascii;
913 assert(obj);
914 ascii = PyObject_ASCII(obj);
915 if (!ascii)
916 goto fail;
917 n += PyUnicode_GET_SIZE(ascii);
918 /* Remember the repr and switch to the next slot */
919 *callresult++ = ascii;
920 break;
921 }
922 case 'p':
923 (void) va_arg(count, int);
924 /* maximum 64-bit pointer representation:
925 * 0xffffffffffffffff
926 * so 19 characters is enough.
927 * XXX I count 18 -- what's the extra for?
928 */
929 n += 19;
930 break;
931 default:
932 /* if we stumble upon an unknown
933 formatting code, copy the rest of
934 the format string to the output
935 string. (we cannot just skip the
936 code, since there's no way to know
937 what's in the argument list) */
938 n += strlen(p);
939 goto expand;
940 }
941 } else
942 n++;
943 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000944 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000945 if (abuffersize > ITEM_BUFFER_LEN) {
946 /* add 1 for sprintf's trailing null byte */
947 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000948 if (!abuffer) {
949 PyErr_NoMemory();
950 goto fail;
951 }
952 realbuffer = abuffer;
953 }
954 else
955 realbuffer = buffer;
956 /* step 4: fill the buffer */
957 /* Since we've analyzed how much space we need for the worst case,
958 we don't have to resize the string.
959 There can be no errors beyond this point. */
960 string = PyUnicode_FromUnicode(NULL, n);
961 if (!string)
962 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000963
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 s = PyUnicode_AS_UNICODE(string);
965 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000966
Benjamin Peterson14339b62009-01-31 16:36:08 +0000967 for (f = format; *f; f++) {
968 if (*f == '%') {
969 const char* p = f++;
970 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000971 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000972 int size_tflag = 0;
973 zeropad = (*f == '0');
974 /* parse the width.precision part */
975 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000976 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000977 width = (width*10) + *f++ - '0';
978 precision = 0;
979 if (*f == '.') {
980 f++;
David Malcolm96960882010-11-05 17:23:41 +0000981 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000982 precision = (precision*10) + *f++ - '0';
983 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000984 /* Handle %ld, %lu, %lld and %llu. */
985 if (*f == 'l') {
986 if (f[1] == 'd' || f[1] == 'u') {
987 longflag = 1;
988 ++f;
989 }
990#ifdef HAVE_LONG_LONG
991 else if (f[1] == 'l' &&
992 (f[2] == 'd' || f[2] == 'u')) {
993 longlongflag = 1;
994 f += 2;
995 }
996#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000997 }
998 /* handle the size_t flag. */
999 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1000 size_tflag = 1;
1001 ++f;
1002 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001003
Benjamin Peterson14339b62009-01-31 16:36:08 +00001004 switch (*f) {
1005 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001006 {
1007 int ordinal = va_arg(vargs, int);
1008#ifndef Py_UNICODE_WIDE
1009 if (ordinal > 0xffff) {
1010 ordinal -= 0x10000;
1011 *s++ = 0xD800 | (ordinal >> 10);
1012 *s++ = 0xDC00 | (ordinal & 0x3FF);
1013 } else
1014#endif
1015 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001016 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001017 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001019 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1020 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 if (longflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001023#ifdef HAVE_LONG_LONG
1024 else if (longlongflag)
1025 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1026#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001027 else if (size_tflag)
1028 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1029 else
1030 sprintf(realbuffer, fmt, va_arg(vargs, int));
1031 appendstring(realbuffer);
1032 break;
1033 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1035 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001036 if (longflag)
1037 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001038#ifdef HAVE_LONG_LONG
1039 else if (longlongflag)
1040 sprintf(realbuffer, fmt, va_arg(vargs,
1041 unsigned PY_LONG_LONG));
1042#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 else if (size_tflag)
1044 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1045 else
1046 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1047 appendstring(realbuffer);
1048 break;
1049 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001050 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 sprintf(realbuffer, fmt, va_arg(vargs, int));
1052 appendstring(realbuffer);
1053 break;
1054 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001055 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001056 sprintf(realbuffer, fmt, va_arg(vargs, int));
1057 appendstring(realbuffer);
1058 break;
1059 case 's':
1060 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001061 /* unused, since we already have the result */
1062 (void) va_arg(vargs, char *);
1063 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1064 PyUnicode_GET_SIZE(*callresult));
1065 s += PyUnicode_GET_SIZE(*callresult);
1066 /* We're done with the unicode()/repr() => forget it */
1067 Py_DECREF(*callresult);
1068 /* switch to next unicode()/repr() result */
1069 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001070 break;
1071 }
1072 case 'U':
1073 {
1074 PyObject *obj = va_arg(vargs, PyObject *);
1075 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1076 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1077 s += size;
1078 break;
1079 }
1080 case 'V':
1081 {
1082 PyObject *obj = va_arg(vargs, PyObject *);
1083 const char *str = va_arg(vargs, const char *);
1084 if (obj) {
1085 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1086 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1087 s += size;
1088 } else {
1089 appendstring(str);
1090 }
1091 break;
1092 }
1093 case 'S':
1094 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001095 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001096 {
1097 Py_UNICODE *ucopy;
1098 Py_ssize_t usize;
1099 Py_ssize_t upos;
1100 /* unused, since we already have the result */
1101 (void) va_arg(vargs, PyObject *);
1102 ucopy = PyUnicode_AS_UNICODE(*callresult);
1103 usize = PyUnicode_GET_SIZE(*callresult);
1104 for (upos = 0; upos<usize;)
1105 *s++ = ucopy[upos++];
1106 /* We're done with the unicode()/repr() => forget it */
1107 Py_DECREF(*callresult);
1108 /* switch to next unicode()/repr() result */
1109 ++callresult;
1110 break;
1111 }
1112 case 'p':
1113 sprintf(buffer, "%p", va_arg(vargs, void*));
1114 /* %p is ill-defined: ensure leading 0x. */
1115 if (buffer[1] == 'X')
1116 buffer[1] = 'x';
1117 else if (buffer[1] != 'x') {
1118 memmove(buffer+2, buffer, strlen(buffer)+1);
1119 buffer[0] = '0';
1120 buffer[1] = 'x';
1121 }
1122 appendstring(buffer);
1123 break;
1124 case '%':
1125 *s++ = '%';
1126 break;
1127 default:
1128 appendstring(p);
1129 goto end;
1130 }
Victor Stinner1205f272010-09-11 00:54:47 +00001131 }
Victor Stinner1205f272010-09-11 00:54:47 +00001132 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001133 *s++ = *f;
1134 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001135
Benjamin Peterson29060642009-01-31 22:14:21 +00001136 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001137 if (callresults)
1138 PyObject_Free(callresults);
1139 if (abuffer)
1140 PyObject_Free(abuffer);
1141 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1142 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001143 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 if (callresults) {
1145 PyObject **callresult2 = callresults;
1146 while (callresult2 < callresult) {
1147 Py_DECREF(*callresult2);
1148 ++callresult2;
1149 }
1150 PyObject_Free(callresults);
1151 }
1152 if (abuffer)
1153 PyObject_Free(abuffer);
1154 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
1157#undef appendstring
1158
1159PyObject *
1160PyUnicode_FromFormat(const char *format, ...)
1161{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 PyObject* ret;
1163 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
1165#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001167#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001168 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001169#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001170 ret = PyUnicode_FromFormatV(format, vargs);
1171 va_end(vargs);
1172 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001173}
1174
Victor Stinner5593d8a2010-10-02 11:11:27 +00001175/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1176 convert a Unicode object to a wide character string.
1177
1178 - If w is NULL: return the number of wide characters (including the nul
1179 character) required to convert the unicode object. Ignore size argument.
1180
1181 - Otherwise: return the number of wide characters (excluding the nul
1182 character) written into w. Write at most size wide characters (including
1183 the nul character). */
1184static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001185unicode_aswidechar(PyUnicodeObject *unicode,
1186 wchar_t *w,
1187 Py_ssize_t size)
1188{
1189#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001190 Py_ssize_t res;
1191 if (w != NULL) {
1192 res = PyUnicode_GET_SIZE(unicode);
1193 if (size > res)
1194 size = res + 1;
1195 else
1196 res = size;
1197 memcpy(w, unicode->str, size * sizeof(wchar_t));
1198 return res;
1199 }
1200 else
1201 return PyUnicode_GET_SIZE(unicode) + 1;
1202#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1203 register const Py_UNICODE *u;
1204 const Py_UNICODE *uend;
1205 const wchar_t *worig, *wend;
1206 Py_ssize_t nchar;
1207
Victor Stinner137c34c2010-09-29 10:25:54 +00001208 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001209 uend = u + PyUnicode_GET_SIZE(unicode);
1210 if (w != NULL) {
1211 worig = w;
1212 wend = w + size;
1213 while (u != uend && w != wend) {
1214 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1215 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1216 {
1217 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1218 u += 2;
1219 }
1220 else {
1221 *w = *u;
1222 u++;
1223 }
1224 w++;
1225 }
1226 if (w != wend)
1227 *w = L'\0';
1228 return w - worig;
1229 }
1230 else {
1231 nchar = 1; /* nul character at the end */
1232 while (u != uend) {
1233 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1234 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1235 u += 2;
1236 else
1237 u++;
1238 nchar++;
1239 }
1240 }
1241 return nchar;
1242#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1243 register Py_UNICODE *u, *uend, ordinal;
1244 register Py_ssize_t i;
1245 wchar_t *worig, *wend;
1246 Py_ssize_t nchar;
1247
1248 u = PyUnicode_AS_UNICODE(unicode);
1249 uend = u + PyUnicode_GET_SIZE(u);
1250 if (w != NULL) {
1251 worig = w;
1252 wend = w + size;
1253 while (u != uend && w != wend) {
1254 ordinal = *u;
1255 if (ordinal > 0xffff) {
1256 ordinal -= 0x10000;
1257 *w++ = 0xD800 | (ordinal >> 10);
1258 *w++ = 0xDC00 | (ordinal & 0x3FF);
1259 }
1260 else
1261 *w++ = ordinal;
1262 u++;
1263 }
1264 if (w != wend)
1265 *w = 0;
1266 return w - worig;
1267 }
1268 else {
1269 nchar = 1; /* nul character */
1270 while (u != uend) {
1271 if (*u > 0xffff)
1272 nchar += 2;
1273 else
1274 nchar++;
1275 u++;
1276 }
1277 return nchar;
1278 }
1279#else
1280# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001281#endif
1282}
1283
1284Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001285PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001286 wchar_t *w,
1287 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288{
1289 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001290 PyErr_BadInternalCall();
1291 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001293 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
Victor Stinner137c34c2010-09-29 10:25:54 +00001296wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001297PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001298 Py_ssize_t *size)
1299{
1300 wchar_t* buffer;
1301 Py_ssize_t buflen;
1302
1303 if (unicode == NULL) {
1304 PyErr_BadInternalCall();
1305 return NULL;
1306 }
1307
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001308 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001309 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001310 PyErr_NoMemory();
1311 return NULL;
1312 }
1313
Victor Stinner137c34c2010-09-29 10:25:54 +00001314 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1315 if (buffer == NULL) {
1316 PyErr_NoMemory();
1317 return NULL;
1318 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001319 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001320 if (size != NULL)
1321 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001322 return buffer;
1323}
1324
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325#endif
1326
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001327PyObject *PyUnicode_FromOrdinal(int ordinal)
1328{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001329 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001330
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001331 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001332 PyErr_SetString(PyExc_ValueError,
1333 "chr() arg not in range(0x110000)");
1334 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001335 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001336
1337#ifndef Py_UNICODE_WIDE
1338 if (ordinal > 0xffff) {
1339 ordinal -= 0x10000;
1340 s[0] = 0xD800 | (ordinal >> 10);
1341 s[1] = 0xDC00 | (ordinal & 0x3FF);
1342 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001343 }
1344#endif
1345
Hye-Shik Chang40574832004-04-06 07:24:51 +00001346 s[0] = (Py_UNICODE)ordinal;
1347 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001348}
1349
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350PyObject *PyUnicode_FromObject(register PyObject *obj)
1351{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001352 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001354 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001355 Py_INCREF(obj);
1356 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001357 }
1358 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 /* For a Unicode subtype that's not a Unicode object,
1360 return a true Unicode object with the same data. */
1361 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1362 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001363 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001364 PyErr_Format(PyExc_TypeError,
1365 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001366 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001367 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001368}
1369
1370PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 const char *encoding,
1372 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001373{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001374 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001375 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001376
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001378 PyErr_BadInternalCall();
1379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001381
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001382 /* Decoding bytes objects is the most common case and should be fast */
1383 if (PyBytes_Check(obj)) {
1384 if (PyBytes_GET_SIZE(obj) == 0) {
1385 Py_INCREF(unicode_empty);
1386 v = (PyObject *) unicode_empty;
1387 }
1388 else {
1389 v = PyUnicode_Decode(
1390 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1391 encoding, errors);
1392 }
1393 return v;
1394 }
1395
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001396 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 PyErr_SetString(PyExc_TypeError,
1398 "decoding str is not supported");
1399 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001400 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001401
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001402 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1403 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1404 PyErr_Format(PyExc_TypeError,
1405 "coercing to str: need bytes, bytearray "
1406 "or buffer-like object, %.80s found",
1407 Py_TYPE(obj)->tp_name);
1408 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001409 }
Tim Petersced69f82003-09-16 20:30:58 +00001410
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001411 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001412 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001413 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 }
Tim Petersced69f82003-09-16 20:30:58 +00001415 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001416 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001417
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001418 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001419 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420}
1421
/* Write a normalized copy of encoding into lower: upper case letters are
   converted to lower case and '_' is replaced with '-' (so that e.g. UTF_8
   is recognized).  lower has room for lower_len bytes, including the nul.
   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1); in that case lower is left without a terminating nul. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *const last = &lower[lower_len - 1];   /* slot kept for the nul */

    for (src = encoding; *src != '\0'; src++) {
        if (dst == last)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1454
1455PyObject *PyUnicode_Decode(const char *s,
1456 Py_ssize_t size,
1457 const char *encoding,
1458 const char *errors)
1459{
1460 PyObject *buffer = NULL, *unicode;
1461 Py_buffer info;
1462 char lower[11]; /* Enough for any encoding shortcut */
1463
1464 if (encoding == NULL)
1465 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001466
1467 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001468 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1469 if (strcmp(lower, "utf-8") == 0)
1470 return PyUnicode_DecodeUTF8(s, size, errors);
1471 else if ((strcmp(lower, "latin-1") == 0) ||
1472 (strcmp(lower, "iso-8859-1") == 0))
1473 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001474#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001475 else if (strcmp(lower, "mbcs") == 0)
1476 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001477#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001478 else if (strcmp(lower, "ascii") == 0)
1479 return PyUnicode_DecodeASCII(s, size, errors);
1480 else if (strcmp(lower, "utf-16") == 0)
1481 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1482 else if (strcmp(lower, "utf-32") == 0)
1483 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485
1486 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001487 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001488 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001489 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001490 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491 if (buffer == NULL)
1492 goto onError;
1493 unicode = PyCodec_Decode(buffer, encoding, errors);
1494 if (unicode == NULL)
1495 goto onError;
1496 if (!PyUnicode_Check(unicode)) {
1497 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001498 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001499 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 Py_DECREF(unicode);
1501 goto onError;
1502 }
1503 Py_DECREF(buffer);
1504 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001505
Benjamin Peterson29060642009-01-31 22:14:21 +00001506 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 Py_XDECREF(buffer);
1508 return NULL;
1509}
1510
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001511PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1512 const char *encoding,
1513 const char *errors)
1514{
1515 PyObject *v;
1516
1517 if (!PyUnicode_Check(unicode)) {
1518 PyErr_BadArgument();
1519 goto onError;
1520 }
1521
1522 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001524
1525 /* Decode via the codec registry */
1526 v = PyCodec_Decode(unicode, encoding, errors);
1527 if (v == NULL)
1528 goto onError;
1529 return v;
1530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001532 return NULL;
1533}
1534
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001535PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1536 const char *encoding,
1537 const char *errors)
1538{
1539 PyObject *v;
1540
1541 if (!PyUnicode_Check(unicode)) {
1542 PyErr_BadArgument();
1543 goto onError;
1544 }
1545
1546 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001547 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001548
1549 /* Decode via the codec registry */
1550 v = PyCodec_Decode(unicode, encoding, errors);
1551 if (v == NULL)
1552 goto onError;
1553 if (!PyUnicode_Check(v)) {
1554 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001555 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001556 Py_TYPE(v)->tp_name);
1557 Py_DECREF(v);
1558 goto onError;
1559 }
1560 return v;
1561
Benjamin Peterson29060642009-01-31 22:14:21 +00001562 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001563 return NULL;
1564}
1565
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 Py_ssize_t size,
1568 const char *encoding,
1569 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570{
1571 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001572
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573 unicode = PyUnicode_FromUnicode(s, size);
1574 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001576 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1577 Py_DECREF(unicode);
1578 return v;
1579}
1580
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001581PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1582 const char *encoding,
1583 const char *errors)
1584{
1585 PyObject *v;
1586
1587 if (!PyUnicode_Check(unicode)) {
1588 PyErr_BadArgument();
1589 goto onError;
1590 }
1591
1592 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001594
1595 /* Encode via the codec registry */
1596 v = PyCodec_Encode(unicode, encoding, errors);
1597 if (v == NULL)
1598 goto onError;
1599 return v;
1600
Benjamin Peterson29060642009-01-31 22:14:21 +00001601 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001602 return NULL;
1603}
1604
Victor Stinnerad158722010-10-27 00:25:46 +00001605PyObject *
1606PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001607{
Victor Stinner313a1202010-06-11 23:56:51 +00001608#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001609 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1610 PyUnicode_GET_SIZE(unicode),
1611 NULL);
1612#elif defined(__APPLE__)
1613 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1614 PyUnicode_GET_SIZE(unicode),
1615 "surrogateescape");
1616#else
1617 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001618 return PyUnicode_AsEncodedString(unicode,
1619 Py_FileSystemDefaultEncoding,
1620 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001621 }
1622 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001623 /* locale encoding with surrogateescape */
1624 wchar_t *wchar;
1625 char *bytes;
1626 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001627 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001628
1629 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1630 if (wchar == NULL)
1631 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001632 bytes = _Py_wchar2char(wchar, &error_pos);
1633 if (bytes == NULL) {
1634 if (error_pos != (size_t)-1) {
1635 char *errmsg = strerror(errno);
1636 PyObject *exc = NULL;
1637 if (errmsg == NULL)
1638 errmsg = "Py_wchar2char() failed";
1639 raise_encode_exception(&exc,
1640 "filesystemencoding",
1641 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1642 error_pos, error_pos+1,
1643 errmsg);
1644 Py_XDECREF(exc);
1645 }
1646 else
1647 PyErr_NoMemory();
1648 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001649 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001650 }
1651 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001652
1653 bytes_obj = PyBytes_FromString(bytes);
1654 PyMem_Free(bytes);
1655 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001656 }
Victor Stinnerad158722010-10-27 00:25:46 +00001657#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001658}
1659
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1661 const char *encoding,
1662 const char *errors)
1663{
1664 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001665 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001666
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667 if (!PyUnicode_Check(unicode)) {
1668 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670 }
Fred Drakee4315f52000-05-09 19:53:39 +00001671
Tim Petersced69f82003-09-16 20:30:58 +00001672 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001673 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001674
1675 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001676 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1677 if (strcmp(lower, "utf-8") == 0)
1678 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1679 PyUnicode_GET_SIZE(unicode),
1680 errors);
1681 else if ((strcmp(lower, "latin-1") == 0) ||
1682 (strcmp(lower, "iso-8859-1") == 0))
1683 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1684 PyUnicode_GET_SIZE(unicode),
1685 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001686#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001687 else if (strcmp(lower, "mbcs") == 0)
1688 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1689 PyUnicode_GET_SIZE(unicode),
1690 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001691#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001692 else if (strcmp(lower, "ascii") == 0)
1693 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1694 PyUnicode_GET_SIZE(unicode),
1695 errors);
1696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001697
1698 /* Encode via the codec registry */
1699 v = PyCodec_Encode(unicode, encoding, errors);
1700 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001701 return NULL;
1702
1703 /* The normal path */
1704 if (PyBytes_Check(v))
1705 return v;
1706
1707 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001708 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001709 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001710 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001711
1712 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1713 "encoder %s returned bytearray instead of bytes",
1714 encoding);
1715 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001716 Py_DECREF(v);
1717 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001718 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001719
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001720 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1721 Py_DECREF(v);
1722 return b;
1723 }
1724
1725 PyErr_Format(PyExc_TypeError,
1726 "encoder did not return a bytes object (type=%.400s)",
1727 Py_TYPE(v)->tp_name);
1728 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001729 return NULL;
1730}
1731
1732PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1733 const char *encoding,
1734 const char *errors)
1735{
1736 PyObject *v;
1737
1738 if (!PyUnicode_Check(unicode)) {
1739 PyErr_BadArgument();
1740 goto onError;
1741 }
1742
1743 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001744 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001745
1746 /* Encode via the codec registry */
1747 v = PyCodec_Encode(unicode, encoding, errors);
1748 if (v == NULL)
1749 goto onError;
1750 if (!PyUnicode_Check(v)) {
1751 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001752 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001753 Py_TYPE(v)->tp_name);
1754 Py_DECREF(v);
1755 goto onError;
1756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001758
Benjamin Peterson29060642009-01-31 22:14:21 +00001759 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760 return NULL;
1761}
1762
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001763PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001764 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001765{
1766 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001767 if (v)
1768 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001769 if (errors != NULL)
1770 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001771 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001772 PyUnicode_GET_SIZE(unicode),
1773 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001774 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001775 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001776 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001777 return v;
1778}
1779
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001780PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001781PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001782 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001783 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1784}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001785
Christian Heimes5894ba72007-11-04 11:43:14 +00001786PyObject*
1787PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1788{
Victor Stinnerad158722010-10-27 00:25:46 +00001789#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1790 return PyUnicode_DecodeMBCS(s, size, NULL);
1791#elif defined(__APPLE__)
1792 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1793#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001794 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1795 can be undefined. If it is case, decode using UTF-8. The following assumes
1796 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1797 bootstrapping process where the codecs aren't ready yet.
1798 */
1799 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001800 return PyUnicode_Decode(s, size,
1801 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001802 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001803 }
1804 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001805 /* locale encoding with surrogateescape */
1806 wchar_t *wchar;
1807 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001808 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001809
1810 if (s[size] != '\0' || size != strlen(s)) {
1811 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1812 return NULL;
1813 }
1814
Victor Stinner168e1172010-10-16 23:16:16 +00001815 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001816 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001817 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001818
Victor Stinner168e1172010-10-16 23:16:16 +00001819 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001820 PyMem_Free(wchar);
1821 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001822 }
Victor Stinnerad158722010-10-27 00:25:46 +00001823#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001824}
1825
Martin v. Löwis011e8422009-05-05 04:43:17 +00001826
1827int
1828PyUnicode_FSConverter(PyObject* arg, void* addr)
1829{
1830 PyObject *output = NULL;
1831 Py_ssize_t size;
1832 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001833 if (arg == NULL) {
1834 Py_DECREF(*(PyObject**)addr);
1835 return 1;
1836 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001837 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001838 output = arg;
1839 Py_INCREF(output);
1840 }
1841 else {
1842 arg = PyUnicode_FromObject(arg);
1843 if (!arg)
1844 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001845 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001846 Py_DECREF(arg);
1847 if (!output)
1848 return 0;
1849 if (!PyBytes_Check(output)) {
1850 Py_DECREF(output);
1851 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1852 return 0;
1853 }
1854 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001855 size = PyBytes_GET_SIZE(output);
1856 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001857 if (size != strlen(data)) {
1858 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1859 Py_DECREF(output);
1860 return 0;
1861 }
1862 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001863 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001864}
1865
1866
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001867int
1868PyUnicode_FSDecoder(PyObject* arg, void* addr)
1869{
1870 PyObject *output = NULL;
1871 Py_ssize_t size;
1872 void *data;
1873 if (arg == NULL) {
1874 Py_DECREF(*(PyObject**)addr);
1875 return 1;
1876 }
1877 if (PyUnicode_Check(arg)) {
1878 output = arg;
1879 Py_INCREF(output);
1880 }
1881 else {
1882 arg = PyBytes_FromObject(arg);
1883 if (!arg)
1884 return 0;
1885 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1886 PyBytes_GET_SIZE(arg));
1887 Py_DECREF(arg);
1888 if (!output)
1889 return 0;
1890 if (!PyUnicode_Check(output)) {
1891 Py_DECREF(output);
1892 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1893 return 0;
1894 }
1895 }
1896 size = PyUnicode_GET_SIZE(output);
1897 data = PyUnicode_AS_UNICODE(output);
1898 if (size != Py_UNICODE_strlen(data)) {
1899 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1900 Py_DECREF(output);
1901 return 0;
1902 }
1903 *(PyObject**)addr = output;
1904 return Py_CLEANUP_SUPPORTED;
1905}
1906
1907
Martin v. Löwis5b222132007-06-10 09:51:05 +00001908char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001909_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001910{
Christian Heimesf3863112007-11-22 07:46:41 +00001911 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001912 if (!PyUnicode_Check(unicode)) {
1913 PyErr_BadArgument();
1914 return NULL;
1915 }
Christian Heimesf3863112007-11-22 07:46:41 +00001916 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1917 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001918 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001919 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001920 *psize = PyBytes_GET_SIZE(bytes);
1921 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001922}
1923
1924char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001925_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001926{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001927 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001928}
1929
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1931{
1932 if (!PyUnicode_Check(unicode)) {
1933 PyErr_BadArgument();
1934 goto onError;
1935 }
1936 return PyUnicode_AS_UNICODE(unicode);
1937
Benjamin Peterson29060642009-01-31 22:14:21 +00001938 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 return NULL;
1940}
1941
Martin v. Löwis18e16552006-02-15 17:27:45 +00001942Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943{
1944 if (!PyUnicode_Check(unicode)) {
1945 PyErr_BadArgument();
1946 goto onError;
1947 }
1948 return PyUnicode_GET_SIZE(unicode);
1949
Benjamin Peterson29060642009-01-31 22:14:21 +00001950 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 return -1;
1952}
1953
/* Return the name of the default encoding.  The value is the fixed
   string "utf-8"; the returned pointer refers to static storage. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1958
Victor Stinner554f3f02010-06-16 23:33:54 +00001959/* create or adjust a UnicodeDecodeError */
1960static void
1961make_decode_exception(PyObject **exceptionObject,
1962 const char *encoding,
1963 const char *input, Py_ssize_t length,
1964 Py_ssize_t startpos, Py_ssize_t endpos,
1965 const char *reason)
1966{
1967 if (*exceptionObject == NULL) {
1968 *exceptionObject = PyUnicodeDecodeError_Create(
1969 encoding, input, length, startpos, endpos, reason);
1970 }
1971 else {
1972 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1973 goto onError;
1974 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1975 goto onError;
1976 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1977 goto onError;
1978 }
1979 return;
1980
1981onError:
1982 Py_DECREF(*exceptionObject);
1983 *exceptionObject = NULL;
1984}
1985
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001986/* error handling callback helper:
1987 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001988 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001989 and adjust various state variables.
1990 return 0 on success, -1 on error
1991*/
1992
1993static
1994int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00001995 const char *encoding, const char *reason,
1996 const char **input, const char **inend, Py_ssize_t *startinpos,
1997 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1998 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001999{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002000 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002001
2002 PyObject *restuple = NULL;
2003 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002004 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002005 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002006 Py_ssize_t requiredsize;
2007 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002008 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002009 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002010 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 int res = -1;
2012
2013 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002014 *errorHandler = PyCodec_LookupError(errors);
2015 if (*errorHandler == NULL)
2016 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 }
2018
Victor Stinner554f3f02010-06-16 23:33:54 +00002019 make_decode_exception(exceptionObject,
2020 encoding,
2021 *input, *inend - *input,
2022 *startinpos, *endinpos,
2023 reason);
2024 if (*exceptionObject == NULL)
2025 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026
2027 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2028 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002029 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002030 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002031 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002032 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002033 }
2034 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002035 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002036
2037 /* Copy back the bytes variables, which might have been modified by the
2038 callback */
2039 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2040 if (!inputobj)
2041 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002042 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002043 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002044 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002045 *input = PyBytes_AS_STRING(inputobj);
2046 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002047 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002048 /* we can DECREF safely, as the exception has another reference,
2049 so the object won't go away. */
2050 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002051
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002052 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002053 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002054 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002055 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2056 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002057 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058
2059 /* need more space? (at least enough for what we
2060 have+the replacement+the rest of the string (starting
2061 at the new input position), so we won't have to check space
2062 when there are no errors in the rest of the string) */
2063 repptr = PyUnicode_AS_UNICODE(repunicode);
2064 repsize = PyUnicode_GET_SIZE(repunicode);
2065 requiredsize = *outpos + repsize + insize-newpos;
2066 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002067 if (requiredsize<2*outsize)
2068 requiredsize = 2*outsize;
2069 if (_PyUnicode_Resize(output, requiredsize) < 0)
2070 goto onError;
2071 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 }
2073 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002074 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075 Py_UNICODE_COPY(*outptr, repptr, repsize);
2076 *outptr += repsize;
2077 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 /* we made it! */
2080 res = 0;
2081
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002083 Py_XDECREF(restuple);
2084 return res;
2085}
2086
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002087/* --- UTF-7 Codec -------------------------------------------------------- */
2088
Antoine Pitrou244651a2009-05-04 18:56:13 +00002089/* See RFC2152 for details. We encode conservatively and decode liberally. */
2090
2091/* Three simple macros defining base-64. */
2092
/* Is c a base-64 character?  (c is evaluated more than once.) */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ((c) >= '0' && (c) <= '9') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   Any c outside the letter, digit and '+' cases maps to 63, the value
   of '/'. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) <= 127)
2121
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127).
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index.
 * directO and directWS are parenthesized in the expansion so that
 * compound expressions (e.g. "a || b") can be passed safely.  c is
 * evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002167
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002168PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002169 Py_ssize_t size,
2170 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002171{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002172 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2173}
2174
Antoine Pitrou244651a2009-05-04 18:56:13 +00002175/* The decoder. The only state we preserve is our read position,
2176 * i.e. how many characters we have consumed. So if we end in the
2177 * middle of a shift sequence we have to back off the read position
2178 * and the output to the beginning of the sequence, otherwise we lose
2179 * all the shift state (seen bits, number of bits seen, high
2180 * surrogate). */
2181
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002182PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002183 Py_ssize_t size,
2184 const char *errors,
2185 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002187 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002188 Py_ssize_t startinpos;
2189 Py_ssize_t endinpos;
2190 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002191 const char *e;
2192 PyUnicodeObject *unicode;
2193 Py_UNICODE *p;
2194 const char *errmsg = "";
2195 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002196 Py_UNICODE *shiftOutStart;
2197 unsigned int base64bits = 0;
2198 unsigned long base64buffer = 0;
2199 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002200 PyObject *errorHandler = NULL;
2201 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002202
2203 unicode = _PyUnicode_New(size);
2204 if (!unicode)
2205 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002206 if (size == 0) {
2207 if (consumed)
2208 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002209 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002210 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002211
2212 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002213 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002214 e = s + size;
2215
2216 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002217 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002218 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002219 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002220
Antoine Pitrou244651a2009-05-04 18:56:13 +00002221 if (inShift) { /* in a base-64 section */
2222 if (IS_BASE64(ch)) { /* consume a base-64 character */
2223 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2224 base64bits += 6;
2225 s++;
2226 if (base64bits >= 16) {
2227 /* we have enough bits for a UTF-16 value */
2228 Py_UNICODE outCh = (Py_UNICODE)
2229 (base64buffer >> (base64bits-16));
2230 base64bits -= 16;
2231 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2232 if (surrogate) {
2233 /* expecting a second surrogate */
2234 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2235#ifdef Py_UNICODE_WIDE
2236 *p++ = (((surrogate & 0x3FF)<<10)
2237 | (outCh & 0x3FF)) + 0x10000;
2238#else
2239 *p++ = surrogate;
2240 *p++ = outCh;
2241#endif
2242 surrogate = 0;
2243 }
2244 else {
2245 surrogate = 0;
2246 errmsg = "second surrogate missing";
2247 goto utf7Error;
2248 }
2249 }
2250 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2251 /* first surrogate */
2252 surrogate = outCh;
2253 }
2254 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2255 errmsg = "unexpected second surrogate";
2256 goto utf7Error;
2257 }
2258 else {
2259 *p++ = outCh;
2260 }
2261 }
2262 }
2263 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002264 inShift = 0;
2265 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002266 if (surrogate) {
2267 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002268 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002269 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002270 if (base64bits > 0) { /* left-over bits */
2271 if (base64bits >= 6) {
2272 /* We've seen at least one base-64 character */
2273 errmsg = "partial character in shift sequence";
2274 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002275 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002276 else {
2277 /* Some bits remain; they should be zero */
2278 if (base64buffer != 0) {
2279 errmsg = "non-zero padding bits in shift sequence";
2280 goto utf7Error;
2281 }
2282 }
2283 }
2284 if (ch != '-') {
2285 /* '-' is absorbed; other terminating
2286 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002287 *p++ = ch;
2288 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002289 }
2290 }
2291 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002292 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293 s++; /* consume '+' */
2294 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002295 s++;
2296 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002297 }
2298 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002299 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002300 shiftOutStart = p;
2301 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002302 }
2303 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002304 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002305 *p++ = ch;
2306 s++;
2307 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002308 else {
2309 startinpos = s-starts;
2310 s++;
2311 errmsg = "unexpected special character";
2312 goto utf7Error;
2313 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002314 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002315utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002316 outpos = p-PyUnicode_AS_UNICODE(unicode);
2317 endinpos = s-starts;
2318 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002319 errors, &errorHandler,
2320 "utf7", errmsg,
2321 &starts, &e, &startinpos, &endinpos, &exc, &s,
2322 &unicode, &outpos, &p))
2323 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002324 }
2325
Antoine Pitrou244651a2009-05-04 18:56:13 +00002326 /* end of string */
2327
2328 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2329 /* if we're in an inconsistent state, that's an error */
2330 if (surrogate ||
2331 (base64bits >= 6) ||
2332 (base64bits > 0 && base64buffer != 0)) {
2333 outpos = p-PyUnicode_AS_UNICODE(unicode);
2334 endinpos = size;
2335 if (unicode_decode_call_errorhandler(
2336 errors, &errorHandler,
2337 "utf7", "unterminated shift sequence",
2338 &starts, &e, &startinpos, &endinpos, &exc, &s,
2339 &unicode, &outpos, &p))
2340 goto onError;
2341 if (s < e)
2342 goto restart;
2343 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002344 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002345
2346 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002347 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 if (inShift) {
2349 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002350 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002351 }
2352 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002353 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002354 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002355 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002356
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002357 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002358 goto onError;
2359
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002360 Py_XDECREF(errorHandler);
2361 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002362 return (PyObject *)unicode;
2363
Benjamin Peterson29060642009-01-31 22:14:21 +00002364 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002365 Py_XDECREF(errorHandler);
2366 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002367 Py_DECREF(unicode);
2368 return NULL;
2369}
2370
2371
2372PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002373 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002374 int base64SetO,
2375 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002377{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002378 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002379 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002380 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002381 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002382 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002383 unsigned int base64bits = 0;
2384 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002385 char * out;
2386 char * start;
2387
2388 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002389 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002390
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002391 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002392 return PyErr_NoMemory();
2393
Antoine Pitrou244651a2009-05-04 18:56:13 +00002394 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002395 if (v == NULL)
2396 return NULL;
2397
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002398 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002399 for (;i < size; ++i) {
2400 Py_UNICODE ch = s[i];
2401
Antoine Pitrou244651a2009-05-04 18:56:13 +00002402 if (inShift) {
2403 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2404 /* shifting out */
2405 if (base64bits) { /* output remaining bits */
2406 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2407 base64buffer = 0;
2408 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002409 }
2410 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002411 /* Characters not in the BASE64 set implicitly unshift the sequence
2412 so no '-' is required, except if the character is itself a '-' */
2413 if (IS_BASE64(ch) || ch == '-') {
2414 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002415 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002416 *out++ = (char) ch;
2417 }
2418 else {
2419 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002420 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002421 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002422 else { /* not in a shift sequence */
2423 if (ch == '+') {
2424 *out++ = '+';
2425 *out++ = '-';
2426 }
2427 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2428 *out++ = (char) ch;
2429 }
2430 else {
2431 *out++ = '+';
2432 inShift = 1;
2433 goto encode_char;
2434 }
2435 }
2436 continue;
2437encode_char:
2438#ifdef Py_UNICODE_WIDE
2439 if (ch >= 0x10000) {
2440 /* code first surrogate */
2441 base64bits += 16;
2442 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2443 while (base64bits >= 6) {
2444 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2445 base64bits -= 6;
2446 }
2447 /* prepare second surrogate */
2448 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2449 }
2450#endif
2451 base64bits += 16;
2452 base64buffer = (base64buffer << 16) | ch;
2453 while (base64bits >= 6) {
2454 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2455 base64bits -= 6;
2456 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002458 if (base64bits)
2459 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2460 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002461 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002462 if (_PyBytes_Resize(&v, out - start) < 0)
2463 return NULL;
2464 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002465}
2466
Antoine Pitrou244651a2009-05-04 18:56:13 +00002467#undef IS_BASE64
2468#undef FROM_BASE64
2469#undef TO_BASE64
2470#undef DECODE_DIRECT
2471#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002472
Guido van Rossumd57fd912000-03-10 22:53:23 +00002473/* --- UTF-8 Codec -------------------------------------------------------- */
2474
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2496
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002498 Py_ssize_t size,
2499 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500{
Walter Dörwald69652032004-09-07 20:24:22 +00002501 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2502}
2503
Antoine Pitrouab868312009-01-10 15:40:25 +00002504/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2505#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2506
2507/* Mask to quickly check whether a C 'long' contains a
2508 non-ASCII, UTF8-encoded char. */
2509#if (SIZEOF_LONG == 8)
2510# define ASCII_CHAR_MASK 0x8080808080808080L
2511#elif (SIZEOF_LONG == 4)
2512# define ASCII_CHAR_MASK 0x80808080L
2513#else
2514# error C 'long' size should be either 4 or 8!
2515#endif
2516
Walter Dörwald69652032004-09-07 20:24:22 +00002517PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002518 Py_ssize_t size,
2519 const char *errors,
2520 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002521{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002522 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002523 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002524 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002525 Py_ssize_t startinpos;
2526 Py_ssize_t endinpos;
2527 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002528 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 PyUnicodeObject *unicode;
2530 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002531 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002532 PyObject *errorHandler = NULL;
2533 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534
2535 /* Note: size will always be longer than the resulting Unicode
2536 character count */
2537 unicode = _PyUnicode_New(size);
2538 if (!unicode)
2539 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002540 if (size == 0) {
2541 if (consumed)
2542 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545
2546 /* Unpack UTF-8 encoded data */
2547 p = unicode->str;
2548 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002549 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550
2551 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002552 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553
2554 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002555 /* Fast path for runs of ASCII characters. Given that common UTF-8
2556 input will consist of an overwhelming majority of ASCII
2557 characters, we try to optimize for this case by checking
2558 as many characters as a C 'long' can contain.
2559 First, check if we can do an aligned read, as most CPUs have
2560 a penalty for unaligned reads.
2561 */
2562 if (!((size_t) s & LONG_PTR_MASK)) {
2563 /* Help register allocation */
2564 register const char *_s = s;
2565 register Py_UNICODE *_p = p;
2566 while (_s < aligned_end) {
2567 /* Read a whole long at a time (either 4 or 8 bytes),
2568 and do a fast unrolled copy if it only contains ASCII
2569 characters. */
2570 unsigned long data = *(unsigned long *) _s;
2571 if (data & ASCII_CHAR_MASK)
2572 break;
2573 _p[0] = (unsigned char) _s[0];
2574 _p[1] = (unsigned char) _s[1];
2575 _p[2] = (unsigned char) _s[2];
2576 _p[3] = (unsigned char) _s[3];
2577#if (SIZEOF_LONG == 8)
2578 _p[4] = (unsigned char) _s[4];
2579 _p[5] = (unsigned char) _s[5];
2580 _p[6] = (unsigned char) _s[6];
2581 _p[7] = (unsigned char) _s[7];
2582#endif
2583 _s += SIZEOF_LONG;
2584 _p += SIZEOF_LONG;
2585 }
2586 s = _s;
2587 p = _p;
2588 if (s == e)
2589 break;
2590 ch = (unsigned char)*s;
2591 }
2592 }
2593
2594 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002595 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 s++;
2597 continue;
2598 }
2599
2600 n = utf8_code_length[ch];
2601
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002602 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002603 if (consumed)
2604 break;
2605 else {
2606 errmsg = "unexpected end of data";
2607 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002608 endinpos = startinpos+1;
2609 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2610 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002611 goto utf8Error;
2612 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002613 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614
2615 switch (n) {
2616
2617 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002618 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002619 startinpos = s-starts;
2620 endinpos = startinpos+1;
2621 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002622
2623 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002624 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002625 startinpos = s-starts;
2626 endinpos = startinpos+1;
2627 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628
2629 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002630 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002631 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002632 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002633 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002634 goto utf8Error;
2635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002637 assert ((ch > 0x007F) && (ch <= 0x07FF));
2638 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002639 break;
2640
2641 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002642 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2643 will result in surrogates in range d800-dfff. Surrogates are
2644 not valid UTF-8 so they are rejected.
2645 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2646 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002647 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002648 (s[2] & 0xc0) != 0x80 ||
2649 ((unsigned char)s[0] == 0xE0 &&
2650 (unsigned char)s[1] < 0xA0) ||
2651 ((unsigned char)s[0] == 0xED &&
2652 (unsigned char)s[1] > 0x9F)) {
2653 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002654 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002655 endinpos = startinpos + 1;
2656
2657 /* if s[1] first two bits are 1 and 0, then the invalid
2658 continuation byte is s[2], so increment endinpos by 1,
2659 if not, s[1] is invalid and endinpos doesn't need to
2660 be incremented. */
2661 if ((s[1] & 0xC0) == 0x80)
2662 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002663 goto utf8Error;
2664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002666 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2667 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002668 break;
2669
2670 case 4:
2671 if ((s[1] & 0xc0) != 0x80 ||
2672 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002673 (s[3] & 0xc0) != 0x80 ||
2674 ((unsigned char)s[0] == 0xF0 &&
2675 (unsigned char)s[1] < 0x90) ||
2676 ((unsigned char)s[0] == 0xF4 &&
2677 (unsigned char)s[1] > 0x8F)) {
2678 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002679 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002680 endinpos = startinpos + 1;
2681 if ((s[1] & 0xC0) == 0x80) {
2682 endinpos++;
2683 if ((s[2] & 0xC0) == 0x80)
2684 endinpos++;
2685 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 goto utf8Error;
2687 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002688 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002689 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2690 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2691
Fredrik Lundh8f455852001-06-27 18:59:43 +00002692#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002694#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002695 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002696
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002697 /* translate from 10000..10FFFF to 0..FFFF */
2698 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002699
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002700 /* high surrogate = top 10 bits added to D800 */
2701 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002702
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002703 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002704 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002705#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 }
2708 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002709 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002710
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 utf8Error:
2712 outpos = p-PyUnicode_AS_UNICODE(unicode);
2713 if (unicode_decode_call_errorhandler(
2714 errors, &errorHandler,
2715 "utf8", errmsg,
2716 &starts, &e, &startinpos, &endinpos, &exc, &s,
2717 &unicode, &outpos, &p))
2718 goto onError;
2719 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 }
Walter Dörwald69652032004-09-07 20:24:22 +00002721 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002722 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723
2724 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002725 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 goto onError;
2727
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002728 Py_XDECREF(errorHandler);
2729 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002730 return (PyObject *)unicode;
2731
Benjamin Peterson29060642009-01-31 22:14:21 +00002732 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002733 Py_XDECREF(errorHandler);
2734 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735 Py_DECREF(unicode);
2736 return NULL;
2737}
2738
Antoine Pitrouab868312009-01-10 15:40:25 +00002739#undef ASCII_CHAR_MASK
2740
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002741#ifdef __APPLE__
2742
2743/* Simplified UTF-8 decoder using surrogateescape error handler,
2744 used to decode the command line arguments on Mac OS X. */
2745
2746wchar_t*
2747_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2748{
2749 int n;
2750 const char *e;
2751 wchar_t *unicode, *p;
2752
2753 /* Note: size will always be longer than the resulting Unicode
2754 character count */
2755 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2756 PyErr_NoMemory();
2757 return NULL;
2758 }
2759 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2760 if (!unicode)
2761 return NULL;
2762
2763 /* Unpack UTF-8 encoded data */
2764 p = unicode;
2765 e = s + size;
2766 while (s < e) {
2767 Py_UCS4 ch = (unsigned char)*s;
2768
2769 if (ch < 0x80) {
2770 *p++ = (wchar_t)ch;
2771 s++;
2772 continue;
2773 }
2774
2775 n = utf8_code_length[ch];
2776 if (s + n > e) {
2777 goto surrogateescape;
2778 }
2779
2780 switch (n) {
2781 case 0:
2782 case 1:
2783 goto surrogateescape;
2784
2785 case 2:
2786 if ((s[1] & 0xc0) != 0x80)
2787 goto surrogateescape;
2788 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2789 assert ((ch > 0x007F) && (ch <= 0x07FF));
2790 *p++ = (wchar_t)ch;
2791 break;
2792
2793 case 3:
2794 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2795 will result in surrogates in range d800-dfff. Surrogates are
2796 not valid UTF-8 so they are rejected.
2797 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2798 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2799 if ((s[1] & 0xc0) != 0x80 ||
2800 (s[2] & 0xc0) != 0x80 ||
2801 ((unsigned char)s[0] == 0xE0 &&
2802 (unsigned char)s[1] < 0xA0) ||
2803 ((unsigned char)s[0] == 0xED &&
2804 (unsigned char)s[1] > 0x9F)) {
2805
2806 goto surrogateescape;
2807 }
2808 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2809 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2810 *p++ = (Py_UNICODE)ch;
2811 break;
2812
2813 case 4:
2814 if ((s[1] & 0xc0) != 0x80 ||
2815 (s[2] & 0xc0) != 0x80 ||
2816 (s[3] & 0xc0) != 0x80 ||
2817 ((unsigned char)s[0] == 0xF0 &&
2818 (unsigned char)s[1] < 0x90) ||
2819 ((unsigned char)s[0] == 0xF4 &&
2820 (unsigned char)s[1] > 0x8F)) {
2821 goto surrogateescape;
2822 }
2823 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2824 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2825 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2826
2827#if SIZEOF_WCHAR_T == 4
2828 *p++ = (wchar_t)ch;
2829#else
2830 /* compute and append the two surrogates: */
2831
2832 /* translate from 10000..10FFFF to 0..FFFF */
2833 ch -= 0x10000;
2834
2835 /* high surrogate = top 10 bits added to D800 */
2836 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2837
2838 /* low surrogate = bottom 10 bits added to DC00 */
2839 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2840#endif
2841 break;
2842 }
2843 s += n;
2844 continue;
2845
2846 surrogateescape:
2847 *p++ = 0xDC00 + ch;
2848 s++;
2849 }
2850 *p = L'\0';
2851 return unicode;
2852}
2853
2854#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002855
Tim Peters602f7402002-04-27 18:03:26 +00002856/* Allocation strategy: if the string is short, convert into a stack buffer
2857 and allocate exactly as much space needed at the end. Else allocate the
2858 maximum possible needed (4 result bytes per Unicode character), and return
2859 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002860*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002861PyObject *
2862PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002863 Py_ssize_t size,
2864 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002865{
Tim Peters602f7402002-04-27 18:03:26 +00002866#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002867
Guido van Rossum98297ee2007-11-06 21:34:58 +00002868 Py_ssize_t i; /* index into s of next input byte */
2869 PyObject *result; /* result string object */
2870 char *p; /* next free byte in output buffer */
2871 Py_ssize_t nallocated; /* number of result bytes allocated */
2872 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002873 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002874 PyObject *errorHandler = NULL;
2875 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002876
Tim Peters602f7402002-04-27 18:03:26 +00002877 assert(s != NULL);
2878 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879
Tim Peters602f7402002-04-27 18:03:26 +00002880 if (size <= MAX_SHORT_UNICHARS) {
2881 /* Write into the stack buffer; nallocated can't overflow.
2882 * At the end, we'll allocate exactly as much heap space as it
2883 * turns out we need.
2884 */
2885 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002886 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002887 p = stackbuf;
2888 }
2889 else {
2890 /* Overallocate on the heap, and give the excess back at the end. */
2891 nallocated = size * 4;
2892 if (nallocated / 4 != size) /* overflow! */
2893 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002894 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002895 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002896 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002897 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002898 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002899
Tim Peters602f7402002-04-27 18:03:26 +00002900 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002901 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002902
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002903 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002904 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002906
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002908 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002909 *p++ = (char)(0xc0 | (ch >> 6));
2910 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002911 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002912#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002913 /* Special case: check for high and low surrogate */
2914 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2915 Py_UCS4 ch2 = s[i];
2916 /* Combine the two surrogates to form a UCS4 value */
2917 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2918 i++;
2919
2920 /* Encode UCS4 Unicode ordinals */
2921 *p++ = (char)(0xf0 | (ch >> 18));
2922 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002923 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2924 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002925 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002926#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002927 Py_ssize_t newpos;
2928 PyObject *rep;
2929 Py_ssize_t repsize, k;
2930 rep = unicode_encode_call_errorhandler
2931 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2932 s, size, &exc, i-1, i, &newpos);
2933 if (!rep)
2934 goto error;
2935
2936 if (PyBytes_Check(rep))
2937 repsize = PyBytes_GET_SIZE(rep);
2938 else
2939 repsize = PyUnicode_GET_SIZE(rep);
2940
2941 if (repsize > 4) {
2942 Py_ssize_t offset;
2943
2944 if (result == NULL)
2945 offset = p - stackbuf;
2946 else
2947 offset = p - PyBytes_AS_STRING(result);
2948
2949 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2950 /* integer overflow */
2951 PyErr_NoMemory();
2952 goto error;
2953 }
2954 nallocated += repsize - 4;
2955 if (result != NULL) {
2956 if (_PyBytes_Resize(&result, nallocated) < 0)
2957 goto error;
2958 } else {
2959 result = PyBytes_FromStringAndSize(NULL, nallocated);
2960 if (result == NULL)
2961 goto error;
2962 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2963 }
2964 p = PyBytes_AS_STRING(result) + offset;
2965 }
2966
2967 if (PyBytes_Check(rep)) {
2968 char *prep = PyBytes_AS_STRING(rep);
2969 for(k = repsize; k > 0; k--)
2970 *p++ = *prep++;
2971 } else /* rep is unicode */ {
2972 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2973 Py_UNICODE c;
2974
2975 for(k=0; k<repsize; k++) {
2976 c = prep[k];
2977 if (0x80 <= c) {
2978 raise_encode_exception(&exc, "utf-8", s, size,
2979 i-1, i, "surrogates not allowed");
2980 goto error;
2981 }
2982 *p++ = (char)prep[k];
2983 }
2984 }
2985 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002986#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002987 }
Victor Stinner445a6232010-04-22 20:01:57 +00002988#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002989 } else if (ch < 0x10000) {
2990 *p++ = (char)(0xe0 | (ch >> 12));
2991 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2992 *p++ = (char)(0x80 | (ch & 0x3f));
2993 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00002994 /* Encode UCS4 Unicode ordinals */
2995 *p++ = (char)(0xf0 | (ch >> 18));
2996 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2997 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2998 *p++ = (char)(0x80 | (ch & 0x3f));
2999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003001
Guido van Rossum98297ee2007-11-06 21:34:58 +00003002 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003003 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003004 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003005 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003006 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003007 }
3008 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003009 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003010 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003011 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003012 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003013 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003014 Py_XDECREF(errorHandler);
3015 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003016 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003017 error:
3018 Py_XDECREF(errorHandler);
3019 Py_XDECREF(exc);
3020 Py_XDECREF(result);
3021 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003022
Tim Peters602f7402002-04-27 18:03:26 +00003023#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024}
3025
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3027{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 if (!PyUnicode_Check(unicode)) {
3029 PyErr_BadArgument();
3030 return NULL;
3031 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003032 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 PyUnicode_GET_SIZE(unicode),
3034 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035}
3036
Walter Dörwald41980ca2007-08-16 21:55:45 +00003037/* --- UTF-32 Codec ------------------------------------------------------- */
3038
3039PyObject *
3040PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 Py_ssize_t size,
3042 const char *errors,
3043 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003044{
3045 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3046}
3047
3048PyObject *
3049PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 Py_ssize_t size,
3051 const char *errors,
3052 int *byteorder,
3053 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003054{
3055 const char *starts = s;
3056 Py_ssize_t startinpos;
3057 Py_ssize_t endinpos;
3058 Py_ssize_t outpos;
3059 PyUnicodeObject *unicode;
3060 Py_UNICODE *p;
3061#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003062 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003063 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003064#else
3065 const int pairs = 0;
3066#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003067 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003068 int bo = 0; /* assume native ordering by default */
3069 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003070 /* Offsets from q for retrieving bytes in the right order. */
3071#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3072 int iorder[] = {0, 1, 2, 3};
3073#else
3074 int iorder[] = {3, 2, 1, 0};
3075#endif
3076 PyObject *errorHandler = NULL;
3077 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003078
Walter Dörwald41980ca2007-08-16 21:55:45 +00003079 q = (unsigned char *)s;
3080 e = q + size;
3081
3082 if (byteorder)
3083 bo = *byteorder;
3084
3085 /* Check for BOM marks (U+FEFF) in the input and adjust current
3086 byte order setting accordingly. In native mode, the leading BOM
3087 mark is skipped, in all other modes, it is copied to the output
3088 stream as-is (giving a ZWNBSP character). */
3089 if (bo == 0) {
3090 if (size >= 4) {
3091 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003092 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003093#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003094 if (bom == 0x0000FEFF) {
3095 q += 4;
3096 bo = -1;
3097 }
3098 else if (bom == 0xFFFE0000) {
3099 q += 4;
3100 bo = 1;
3101 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003102#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003103 if (bom == 0x0000FEFF) {
3104 q += 4;
3105 bo = 1;
3106 }
3107 else if (bom == 0xFFFE0000) {
3108 q += 4;
3109 bo = -1;
3110 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003111#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003112 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003113 }
3114
3115 if (bo == -1) {
3116 /* force LE */
3117 iorder[0] = 0;
3118 iorder[1] = 1;
3119 iorder[2] = 2;
3120 iorder[3] = 3;
3121 }
3122 else if (bo == 1) {
3123 /* force BE */
3124 iorder[0] = 3;
3125 iorder[1] = 2;
3126 iorder[2] = 1;
3127 iorder[3] = 0;
3128 }
3129
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003130 /* On narrow builds we split characters outside the BMP into two
3131 codepoints => count how much extra space we need. */
3132#ifndef Py_UNICODE_WIDE
3133 for (qq = q; qq < e; qq += 4)
3134 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3135 pairs++;
3136#endif
3137
3138 /* This might be one to much, because of a BOM */
3139 unicode = _PyUnicode_New((size+3)/4+pairs);
3140 if (!unicode)
3141 return NULL;
3142 if (size == 0)
3143 return (PyObject *)unicode;
3144
3145 /* Unpack UTF-32 encoded data */
3146 p = unicode->str;
3147
Walter Dörwald41980ca2007-08-16 21:55:45 +00003148 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 Py_UCS4 ch;
3150 /* remaining bytes at the end? (size should be divisible by 4) */
3151 if (e-q<4) {
3152 if (consumed)
3153 break;
3154 errmsg = "truncated data";
3155 startinpos = ((const char *)q)-starts;
3156 endinpos = ((const char *)e)-starts;
3157 goto utf32Error;
3158 /* The remaining input chars are ignored if the callback
3159 chooses to skip the input */
3160 }
3161 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3162 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003163
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 if (ch >= 0x110000)
3165 {
3166 errmsg = "codepoint not in range(0x110000)";
3167 startinpos = ((const char *)q)-starts;
3168 endinpos = startinpos+4;
3169 goto utf32Error;
3170 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003171#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 if (ch >= 0x10000)
3173 {
3174 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3175 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3176 }
3177 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003178#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 *p++ = ch;
3180 q += 4;
3181 continue;
3182 utf32Error:
3183 outpos = p-PyUnicode_AS_UNICODE(unicode);
3184 if (unicode_decode_call_errorhandler(
3185 errors, &errorHandler,
3186 "utf32", errmsg,
3187 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3188 &unicode, &outpos, &p))
3189 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003190 }
3191
3192 if (byteorder)
3193 *byteorder = bo;
3194
3195 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003196 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003197
3198 /* Adjust length */
3199 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3200 goto onError;
3201
3202 Py_XDECREF(errorHandler);
3203 Py_XDECREF(exc);
3204 return (PyObject *)unicode;
3205
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003207 Py_DECREF(unicode);
3208 Py_XDECREF(errorHandler);
3209 Py_XDECREF(exc);
3210 return NULL;
3211}
3212
3213PyObject *
3214PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003215 Py_ssize_t size,
3216 const char *errors,
3217 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003218{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003219 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003220 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003221 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003222#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003223 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003224#else
3225 const int pairs = 0;
3226#endif
3227 /* Offsets from p for storing byte pairs in the right order. */
3228#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3229 int iorder[] = {0, 1, 2, 3};
3230#else
3231 int iorder[] = {3, 2, 1, 0};
3232#endif
3233
Benjamin Peterson29060642009-01-31 22:14:21 +00003234#define STORECHAR(CH) \
3235 do { \
3236 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3237 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3238 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3239 p[iorder[0]] = (CH) & 0xff; \
3240 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003241 } while(0)
3242
3243 /* In narrow builds we can output surrogate pairs as one codepoint,
3244 so we need less space. */
3245#ifndef Py_UNICODE_WIDE
3246 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3248 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3249 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003250#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003251 nsize = (size - pairs + (byteorder == 0));
3252 bytesize = nsize * 4;
3253 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003254 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003255 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003256 if (v == NULL)
3257 return NULL;
3258
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003259 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003260 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003262 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003263 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003264
3265 if (byteorder == -1) {
3266 /* force LE */
3267 iorder[0] = 0;
3268 iorder[1] = 1;
3269 iorder[2] = 2;
3270 iorder[3] = 3;
3271 }
3272 else if (byteorder == 1) {
3273 /* force BE */
3274 iorder[0] = 3;
3275 iorder[1] = 2;
3276 iorder[2] = 1;
3277 iorder[3] = 0;
3278 }
3279
3280 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003282#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3284 Py_UCS4 ch2 = *s;
3285 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3286 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3287 s++;
3288 size--;
3289 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003290 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003291#endif
3292 STORECHAR(ch);
3293 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003294
3295 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003296 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003297#undef STORECHAR
3298}
3299
3300PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3301{
3302 if (!PyUnicode_Check(unicode)) {
3303 PyErr_BadArgument();
3304 return NULL;
3305 }
3306 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003307 PyUnicode_GET_SIZE(unicode),
3308 NULL,
3309 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003310}
3311
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312/* --- UTF-16 Codec ------------------------------------------------------- */
3313
Tim Peters772747b2001-08-09 22:21:55 +00003314PyObject *
3315PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 Py_ssize_t size,
3317 const char *errors,
3318 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003319{
Walter Dörwald69652032004-09-07 20:24:22 +00003320 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3321}
3322
Antoine Pitrouab868312009-01-10 15:40:25 +00003323/* Two masks for fast checking of whether a C 'long' may contain
3324 UTF16-encoded surrogate characters. This is an efficient heuristic,
3325 assuming that non-surrogate characters with a code point >= 0x8000 are
3326 rare in most input.
3327 FAST_CHAR_MASK is used when the input is in native byte ordering,
3328 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003329*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003330#if (SIZEOF_LONG == 8)
3331# define FAST_CHAR_MASK 0x8000800080008000L
3332# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3333#elif (SIZEOF_LONG == 4)
3334# define FAST_CHAR_MASK 0x80008000L
3335# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3336#else
3337# error C 'long' size should be either 4 or 8!
3338#endif
3339
Walter Dörwald69652032004-09-07 20:24:22 +00003340PyObject *
3341PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 Py_ssize_t size,
3343 const char *errors,
3344 int *byteorder,
3345 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003346{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003347 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003348 Py_ssize_t startinpos;
3349 Py_ssize_t endinpos;
3350 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351 PyUnicodeObject *unicode;
3352 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003353 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003354 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003355 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003356 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003357 /* Offsets from q for retrieving byte pairs in the right order. */
3358#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3359 int ihi = 1, ilo = 0;
3360#else
3361 int ihi = 0, ilo = 1;
3362#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003363 PyObject *errorHandler = NULL;
3364 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365
3366 /* Note: size will always be longer than the resulting Unicode
3367 character count */
3368 unicode = _PyUnicode_New(size);
3369 if (!unicode)
3370 return NULL;
3371 if (size == 0)
3372 return (PyObject *)unicode;
3373
3374 /* Unpack UTF-16 encoded data */
3375 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003376 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003377 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378
3379 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003380 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003381
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003382 /* Check for BOM marks (U+FEFF) in the input and adjust current
3383 byte order setting accordingly. In native mode, the leading BOM
3384 mark is skipped, in all other modes, it is copied to the output
3385 stream as-is (giving a ZWNBSP character). */
3386 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003387 if (size >= 2) {
3388 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003389#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003390 if (bom == 0xFEFF) {
3391 q += 2;
3392 bo = -1;
3393 }
3394 else if (bom == 0xFFFE) {
3395 q += 2;
3396 bo = 1;
3397 }
Tim Petersced69f82003-09-16 20:30:58 +00003398#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003399 if (bom == 0xFEFF) {
3400 q += 2;
3401 bo = 1;
3402 }
3403 else if (bom == 0xFFFE) {
3404 q += 2;
3405 bo = -1;
3406 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003407#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003408 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410
Tim Peters772747b2001-08-09 22:21:55 +00003411 if (bo == -1) {
3412 /* force LE */
3413 ihi = 1;
3414 ilo = 0;
3415 }
3416 else if (bo == 1) {
3417 /* force BE */
3418 ihi = 0;
3419 ilo = 1;
3420 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003421#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3422 native_ordering = ilo < ihi;
3423#else
3424 native_ordering = ilo > ihi;
3425#endif
Tim Peters772747b2001-08-09 22:21:55 +00003426
Antoine Pitrouab868312009-01-10 15:40:25 +00003427 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003428 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003429 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003430 /* First check for possible aligned read of a C 'long'. Unaligned
3431 reads are more expensive, better to defer to another iteration. */
3432 if (!((size_t) q & LONG_PTR_MASK)) {
3433 /* Fast path for runs of non-surrogate chars. */
3434 register const unsigned char *_q = q;
3435 Py_UNICODE *_p = p;
3436 if (native_ordering) {
3437 /* Native ordering is simple: as long as the input cannot
3438 possibly contain a surrogate char, do an unrolled copy
3439 of several 16-bit code points to the target object.
3440 The non-surrogate check is done on several input bytes
3441 at a time (as many as a C 'long' can contain). */
3442 while (_q < aligned_end) {
3443 unsigned long data = * (unsigned long *) _q;
3444 if (data & FAST_CHAR_MASK)
3445 break;
3446 _p[0] = ((unsigned short *) _q)[0];
3447 _p[1] = ((unsigned short *) _q)[1];
3448#if (SIZEOF_LONG == 8)
3449 _p[2] = ((unsigned short *) _q)[2];
3450 _p[3] = ((unsigned short *) _q)[3];
3451#endif
3452 _q += SIZEOF_LONG;
3453 _p += SIZEOF_LONG / 2;
3454 }
3455 }
3456 else {
3457 /* Byteswapped ordering is similar, but we must decompose
3458 the copy bytewise, and take care of zero'ing out the
3459 upper bytes if the target object is in 32-bit units
3460 (that is, in UCS-4 builds). */
3461 while (_q < aligned_end) {
3462 unsigned long data = * (unsigned long *) _q;
3463 if (data & SWAPPED_FAST_CHAR_MASK)
3464 break;
3465 /* Zero upper bytes in UCS-4 builds */
3466#if (Py_UNICODE_SIZE > 2)
3467 _p[0] = 0;
3468 _p[1] = 0;
3469#if (SIZEOF_LONG == 8)
3470 _p[2] = 0;
3471 _p[3] = 0;
3472#endif
3473#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003474 /* Issue #4916; UCS-4 builds on big endian machines must
3475 fill the two last bytes of each 4-byte unit. */
3476#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3477# define OFF 2
3478#else
3479# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003480#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003481 ((unsigned char *) _p)[OFF + 1] = _q[0];
3482 ((unsigned char *) _p)[OFF + 0] = _q[1];
3483 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3484 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3485#if (SIZEOF_LONG == 8)
3486 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3487 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3488 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3489 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3490#endif
3491#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003492 _q += SIZEOF_LONG;
3493 _p += SIZEOF_LONG / 2;
3494 }
3495 }
3496 p = _p;
3497 q = _q;
3498 if (q >= e)
3499 break;
3500 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003501 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502
Benjamin Peterson14339b62009-01-31 16:36:08 +00003503 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003504
3505 if (ch < 0xD800 || ch > 0xDFFF) {
3506 *p++ = ch;
3507 continue;
3508 }
3509
3510 /* UTF-16 code pair: */
3511 if (q > e) {
3512 errmsg = "unexpected end of data";
3513 startinpos = (((const char *)q) - 2) - starts;
3514 endinpos = ((const char *)e) + 1 - starts;
3515 goto utf16Error;
3516 }
3517 if (0xD800 <= ch && ch <= 0xDBFF) {
3518 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3519 q += 2;
3520 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003521#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003522 *p++ = ch;
3523 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003524#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003525 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003526#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003527 continue;
3528 }
3529 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003530 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003531 startinpos = (((const char *)q)-4)-starts;
3532 endinpos = startinpos+2;
3533 goto utf16Error;
3534 }
3535
Benjamin Peterson14339b62009-01-31 16:36:08 +00003536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 errmsg = "illegal encoding";
3538 startinpos = (((const char *)q)-2)-starts;
3539 endinpos = startinpos+2;
3540 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003541
Benjamin Peterson29060642009-01-31 22:14:21 +00003542 utf16Error:
3543 outpos = p - PyUnicode_AS_UNICODE(unicode);
3544 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003545 errors,
3546 &errorHandler,
3547 "utf16", errmsg,
3548 &starts,
3549 (const char **)&e,
3550 &startinpos,
3551 &endinpos,
3552 &exc,
3553 (const char **)&q,
3554 &unicode,
3555 &outpos,
3556 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003557 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003559 /* remaining byte at the end? (size should be even) */
3560 if (e == q) {
3561 if (!consumed) {
3562 errmsg = "truncated data";
3563 startinpos = ((const char *)q) - starts;
3564 endinpos = ((const char *)e) + 1 - starts;
3565 outpos = p - PyUnicode_AS_UNICODE(unicode);
3566 if (unicode_decode_call_errorhandler(
3567 errors,
3568 &errorHandler,
3569 "utf16", errmsg,
3570 &starts,
3571 (const char **)&e,
3572 &startinpos,
3573 &endinpos,
3574 &exc,
3575 (const char **)&q,
3576 &unicode,
3577 &outpos,
3578 &p))
3579 goto onError;
3580 /* The remaining input chars are ignored if the callback
3581 chooses to skip the input */
3582 }
3583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584
3585 if (byteorder)
3586 *byteorder = bo;
3587
Walter Dörwald69652032004-09-07 20:24:22 +00003588 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003589 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003590
Guido van Rossumd57fd912000-03-10 22:53:23 +00003591 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003592 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 goto onError;
3594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003595 Py_XDECREF(errorHandler);
3596 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 return (PyObject *)unicode;
3598
Benjamin Peterson29060642009-01-31 22:14:21 +00003599 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 Py_XDECREF(errorHandler);
3602 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 return NULL;
3604}
3605
Antoine Pitrouab868312009-01-10 15:40:25 +00003606#undef FAST_CHAR_MASK
3607#undef SWAPPED_FAST_CHAR_MASK
3608
Tim Peters772747b2001-08-09 22:21:55 +00003609PyObject *
3610PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003611 Py_ssize_t size,
3612 const char *errors,
3613 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003615 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003616 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003617 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003618#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003619 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003620#else
3621 const int pairs = 0;
3622#endif
Tim Peters772747b2001-08-09 22:21:55 +00003623 /* Offsets from p for storing byte pairs in the right order. */
3624#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3625 int ihi = 1, ilo = 0;
3626#else
3627 int ihi = 0, ilo = 1;
3628#endif
3629
Benjamin Peterson29060642009-01-31 22:14:21 +00003630#define STORECHAR(CH) \
3631 do { \
3632 p[ihi] = ((CH) >> 8) & 0xff; \
3633 p[ilo] = (CH) & 0xff; \
3634 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003635 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003637#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003638 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 if (s[i] >= 0x10000)
3640 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003641#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003642 /* 2 * (size + pairs + (byteorder == 0)) */
3643 if (size > PY_SSIZE_T_MAX ||
3644 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003646 nsize = size + pairs + (byteorder == 0);
3647 bytesize = nsize * 2;
3648 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003650 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 if (v == NULL)
3652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003654 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003657 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003658 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003659
3660 if (byteorder == -1) {
3661 /* force LE */
3662 ihi = 1;
3663 ilo = 0;
3664 }
3665 else if (byteorder == 1) {
3666 /* force BE */
3667 ihi = 0;
3668 ilo = 1;
3669 }
3670
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003671 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 Py_UNICODE ch = *s++;
3673 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003674#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003675 if (ch >= 0x10000) {
3676 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3677 ch = 0xD800 | ((ch-0x10000) >> 10);
3678 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003679#endif
Tim Peters772747b2001-08-09 22:21:55 +00003680 STORECHAR(ch);
3681 if (ch2)
3682 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003683 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003684
3685 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003686 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003687#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688}
3689
3690PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3691{
3692 if (!PyUnicode_Check(unicode)) {
3693 PyErr_BadArgument();
3694 return NULL;
3695 }
3696 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 PyUnicode_GET_SIZE(unicode),
3698 NULL,
3699 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700}
3701
3702/* --- Unicode Escape Codec ----------------------------------------------- */
3703
Fredrik Lundh06d12682001-01-24 07:59:11 +00003704static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003705
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 Py_ssize_t size,
3708 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003710 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003711 Py_ssize_t startinpos;
3712 Py_ssize_t endinpos;
3713 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003714 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003718 char* message;
3719 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 PyObject *errorHandler = NULL;
3721 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003722
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 /* Escaped strings will always be longer than the resulting
3724 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003725 length after conversion to the true value.
3726 (but if the error callback returns a long replacement string
3727 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 v = _PyUnicode_New(size);
3729 if (v == NULL)
3730 goto onError;
3731 if (size == 0)
3732 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003733
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003734 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003736
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737 while (s < end) {
3738 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003739 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741
3742 /* Non-escape characters are interpreted as Unicode ordinals */
3743 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003744 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003745 continue;
3746 }
3747
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003748 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003749 /* \ - Escapes */
3750 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003751 c = *s++;
3752 if (s > end)
3753 c = '\0'; /* Invalid after \ */
3754 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755
Benjamin Peterson29060642009-01-31 22:14:21 +00003756 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 case '\n': break;
3758 case '\\': *p++ = '\\'; break;
3759 case '\'': *p++ = '\''; break;
3760 case '\"': *p++ = '\"'; break;
3761 case 'b': *p++ = '\b'; break;
3762 case 'f': *p++ = '\014'; break; /* FF */
3763 case 't': *p++ = '\t'; break;
3764 case 'n': *p++ = '\n'; break;
3765 case 'r': *p++ = '\r'; break;
3766 case 'v': *p++ = '\013'; break; /* VT */
3767 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3768
Benjamin Peterson29060642009-01-31 22:14:21 +00003769 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003770 case '0': case '1': case '2': case '3':
3771 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003772 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003773 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003774 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003775 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003776 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003778 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 break;
3780
Benjamin Peterson29060642009-01-31 22:14:21 +00003781 /* hex escapes */
3782 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003784 digits = 2;
3785 message = "truncated \\xXX escape";
3786 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003787
Benjamin Peterson29060642009-01-31 22:14:21 +00003788 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003790 digits = 4;
3791 message = "truncated \\uXXXX escape";
3792 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793
Benjamin Peterson29060642009-01-31 22:14:21 +00003794 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003795 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003796 digits = 8;
3797 message = "truncated \\UXXXXXXXX escape";
3798 hexescape:
3799 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003800 outpos = p-PyUnicode_AS_UNICODE(v);
3801 if (s+digits>end) {
3802 endinpos = size;
3803 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003804 errors, &errorHandler,
3805 "unicodeescape", "end of string in escape sequence",
3806 &starts, &end, &startinpos, &endinpos, &exc, &s,
3807 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003808 goto onError;
3809 goto nextByte;
3810 }
3811 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003812 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003813 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814 endinpos = (s+i+1)-starts;
3815 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003816 errors, &errorHandler,
3817 "unicodeescape", message,
3818 &starts, &end, &startinpos, &endinpos, &exc, &s,
3819 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003820 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003822 }
3823 chr = (chr<<4) & ~0xF;
3824 if (c >= '0' && c <= '9')
3825 chr += c - '0';
3826 else if (c >= 'a' && c <= 'f')
3827 chr += 10 + c - 'a';
3828 else
3829 chr += 10 + c - 'A';
3830 }
3831 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003832 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003833 /* _decoding_error will have already written into the
3834 target buffer. */
3835 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003836 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003837 /* when we get here, chr is a 32-bit unicode character */
3838 if (chr <= 0xffff)
3839 /* UCS-2 character */
3840 *p++ = (Py_UNICODE) chr;
3841 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003842 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003843 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003844#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003845 *p++ = chr;
3846#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003847 chr -= 0x10000L;
3848 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003849 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003850#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003851 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003852 endinpos = s-starts;
3853 outpos = p-PyUnicode_AS_UNICODE(v);
3854 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 errors, &errorHandler,
3856 "unicodeescape", "illegal Unicode character",
3857 &starts, &end, &startinpos, &endinpos, &exc, &s,
3858 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003859 goto onError;
3860 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003861 break;
3862
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003864 case 'N':
3865 message = "malformed \\N character escape";
3866 if (ucnhash_CAPI == NULL) {
3867 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003868 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003869 if (ucnhash_CAPI == NULL)
3870 goto ucnhashError;
3871 }
3872 if (*s == '{') {
3873 const char *start = s+1;
3874 /* look for the closing brace */
3875 while (*s != '}' && s < end)
3876 s++;
3877 if (s > start && s < end && *s == '}') {
3878 /* found a name. look it up in the unicode database */
3879 message = "unknown Unicode character name";
3880 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003881 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003882 goto store;
3883 }
3884 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 endinpos = s-starts;
3886 outpos = p-PyUnicode_AS_UNICODE(v);
3887 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003888 errors, &errorHandler,
3889 "unicodeescape", message,
3890 &starts, &end, &startinpos, &endinpos, &exc, &s,
3891 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003892 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003893 break;
3894
3895 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003896 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 message = "\\ at end of string";
3898 s--;
3899 endinpos = s-starts;
3900 outpos = p-PyUnicode_AS_UNICODE(v);
3901 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003902 errors, &errorHandler,
3903 "unicodeescape", message,
3904 &starts, &end, &startinpos, &endinpos, &exc, &s,
3905 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003906 goto onError;
3907 }
3908 else {
3909 *p++ = '\\';
3910 *p++ = (unsigned char)s[-1];
3911 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003912 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003914 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003917 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003918 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003919 Py_XDECREF(errorHandler);
3920 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003922
Benjamin Peterson29060642009-01-31 22:14:21 +00003923 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003924 PyErr_SetString(
3925 PyExc_UnicodeError,
3926 "\\N escapes not supported (can't load unicodedata module)"
3927 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003928 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003929 Py_XDECREF(errorHandler);
3930 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003931 return NULL;
3932
Benjamin Peterson29060642009-01-31 22:14:21 +00003933 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 Py_XDECREF(errorHandler);
3936 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 return NULL;
3938}
3939
3940/* Return a Unicode-Escape string version of the Unicode object.
3941
3942 If quotes is true, the string is enclosed in u"" or u'' quotes as
3943 appropriate.
3944
3945*/
3946
Thomas Wouters477c8d52006-05-27 19:21:47 +00003947Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003948 Py_ssize_t size,
3949 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003950{
3951 /* like wcschr, but doesn't stop at NULL characters */
3952
3953 while (size-- > 0) {
3954 if (*s == ch)
3955 return s;
3956 s++;
3957 }
3958
3959 return NULL;
3960}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003961
Walter Dörwald79e913e2007-05-12 11:08:06 +00003962static const char *hexdigits = "0123456789abcdef";
3963
3964PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003967 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003970#ifdef Py_UNICODE_WIDE
3971 const Py_ssize_t expandsize = 10;
3972#else
3973 const Py_ssize_t expandsize = 6;
3974#endif
3975
Thomas Wouters89f507f2006-12-13 04:49:30 +00003976 /* XXX(nnorwitz): rather than over-allocating, it would be
3977 better to choose a different scheme. Perhaps scan the
3978 first N-chars of the string and allocate based on that size.
3979 */
3980 /* Initial allocation is based on the longest-possible unichr
3981 escape.
3982
3983 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3984 unichr, so in this case it's the longest unichr escape. In
3985 narrow (UTF-16) builds this is five chars per source unichr
3986 since there are two unichrs in the surrogate pair, so in narrow
3987 (UTF-16) builds it's not the longest unichr escape.
3988
3989 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3990 so in the narrow (UTF-16) build case it's the longest unichr
3991 escape.
3992 */
3993
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003994 if (size == 0)
3995 return PyBytes_FromStringAndSize(NULL, 0);
3996
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003997 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003998 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003999
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004000 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004001 2
4002 + expandsize*size
4003 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 if (repr == NULL)
4005 return NULL;
4006
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004007 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009 while (size-- > 0) {
4010 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004011
Walter Dörwald79e913e2007-05-12 11:08:06 +00004012 /* Escape backslashes */
4013 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014 *p++ = '\\';
4015 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004016 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004017 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004018
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004019#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004020 /* Map 21-bit characters to '\U00xxxxxx' */
4021 else if (ch >= 0x10000) {
4022 *p++ = '\\';
4023 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004024 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4025 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4026 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4027 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4028 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4029 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4030 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4031 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004032 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004033 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004034#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004035 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4036 else if (ch >= 0xD800 && ch < 0xDC00) {
4037 Py_UNICODE ch2;
4038 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004039
Benjamin Peterson29060642009-01-31 22:14:21 +00004040 ch2 = *s++;
4041 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004042 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4044 *p++ = '\\';
4045 *p++ = 'U';
4046 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4047 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4048 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4049 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4050 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4051 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4052 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4053 *p++ = hexdigits[ucs & 0x0000000F];
4054 continue;
4055 }
4056 /* Fall through: isolated surrogates are copied as-is */
4057 s--;
4058 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004059 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004060#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004061
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004063 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 *p++ = '\\';
4065 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004066 *p++ = hexdigits[(ch >> 12) & 0x000F];
4067 *p++ = hexdigits[(ch >> 8) & 0x000F];
4068 *p++ = hexdigits[(ch >> 4) & 0x000F];
4069 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004071
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004072 /* Map special whitespace to '\t', \n', '\r' */
4073 else if (ch == '\t') {
4074 *p++ = '\\';
4075 *p++ = 't';
4076 }
4077 else if (ch == '\n') {
4078 *p++ = '\\';
4079 *p++ = 'n';
4080 }
4081 else if (ch == '\r') {
4082 *p++ = '\\';
4083 *p++ = 'r';
4084 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004085
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004086 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004087 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004089 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004090 *p++ = hexdigits[(ch >> 4) & 0x000F];
4091 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004092 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004093
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 /* Copy everything else as-is */
4095 else
4096 *p++ = (char) ch;
4097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004099 assert(p - PyBytes_AS_STRING(repr) > 0);
4100 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4101 return NULL;
4102 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103}
4104
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004105PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004107 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004108 if (!PyUnicode_Check(unicode)) {
4109 PyErr_BadArgument();
4110 return NULL;
4111 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004112 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4113 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004114 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115}
4116
4117/* --- Raw Unicode Escape Codec ------------------------------------------- */
4118
4119PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 Py_ssize_t size,
4121 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004124 Py_ssize_t startinpos;
4125 Py_ssize_t endinpos;
4126 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 const char *end;
4130 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004131 PyObject *errorHandler = NULL;
4132 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004133
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 /* Escaped strings will always be longer than the resulting
4135 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 length after conversion to the true value. (But decoding error
4137 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 v = _PyUnicode_New(size);
4139 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004140 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004143 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 end = s + size;
4145 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 unsigned char c;
4147 Py_UCS4 x;
4148 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004149 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 /* Non-escape characters are interpreted as Unicode ordinals */
4152 if (*s != '\\') {
4153 *p++ = (unsigned char)*s++;
4154 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004155 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004156 startinpos = s-starts;
4157
4158 /* \u-escapes are only interpreted iff the number of leading
4159 backslashes if odd */
4160 bs = s;
4161 for (;s < end;) {
4162 if (*s != '\\')
4163 break;
4164 *p++ = (unsigned char)*s++;
4165 }
4166 if (((s - bs) & 1) == 0 ||
4167 s >= end ||
4168 (*s != 'u' && *s != 'U')) {
4169 continue;
4170 }
4171 p--;
4172 count = *s=='u' ? 4 : 8;
4173 s++;
4174
4175 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4176 outpos = p-PyUnicode_AS_UNICODE(v);
4177 for (x = 0, i = 0; i < count; ++i, ++s) {
4178 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004179 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 endinpos = s-starts;
4181 if (unicode_decode_call_errorhandler(
4182 errors, &errorHandler,
4183 "rawunicodeescape", "truncated \\uXXXX",
4184 &starts, &end, &startinpos, &endinpos, &exc, &s,
4185 &v, &outpos, &p))
4186 goto onError;
4187 goto nextByte;
4188 }
4189 x = (x<<4) & ~0xF;
4190 if (c >= '0' && c <= '9')
4191 x += c - '0';
4192 else if (c >= 'a' && c <= 'f')
4193 x += 10 + c - 'a';
4194 else
4195 x += 10 + c - 'A';
4196 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004197 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 /* UCS-2 character */
4199 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004200 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 /* UCS-4 character. Either store directly, or as
4202 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004203#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004205#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004206 x -= 0x10000L;
4207 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4208 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004209#endif
4210 } else {
4211 endinpos = s-starts;
4212 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004213 if (unicode_decode_call_errorhandler(
4214 errors, &errorHandler,
4215 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 &starts, &end, &startinpos, &endinpos, &exc, &s,
4217 &v, &outpos, &p))
4218 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004219 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004220 nextByte:
4221 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004223 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004224 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225 Py_XDECREF(errorHandler);
4226 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004228
Benjamin Peterson29060642009-01-31 22:14:21 +00004229 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 Py_XDECREF(errorHandler);
4232 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 return NULL;
4234}
4235
4236PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004237 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004239 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004240 char *p;
4241 char *q;
4242
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004243#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004244 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004245#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004246 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004247#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004248
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004249 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004251
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004252 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253 if (repr == NULL)
4254 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004255 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004256 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004258 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 while (size-- > 0) {
4260 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004261#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 /* Map 32-bit characters to '\Uxxxxxxxx' */
4263 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004264 *p++ = '\\';
4265 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004266 *p++ = hexdigits[(ch >> 28) & 0xf];
4267 *p++ = hexdigits[(ch >> 24) & 0xf];
4268 *p++ = hexdigits[(ch >> 20) & 0xf];
4269 *p++ = hexdigits[(ch >> 16) & 0xf];
4270 *p++ = hexdigits[(ch >> 12) & 0xf];
4271 *p++ = hexdigits[(ch >> 8) & 0xf];
4272 *p++ = hexdigits[(ch >> 4) & 0xf];
4273 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004274 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004275 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004276#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4278 if (ch >= 0xD800 && ch < 0xDC00) {
4279 Py_UNICODE ch2;
4280 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004281
Benjamin Peterson29060642009-01-31 22:14:21 +00004282 ch2 = *s++;
4283 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004284 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004285 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4286 *p++ = '\\';
4287 *p++ = 'U';
4288 *p++ = hexdigits[(ucs >> 28) & 0xf];
4289 *p++ = hexdigits[(ucs >> 24) & 0xf];
4290 *p++ = hexdigits[(ucs >> 20) & 0xf];
4291 *p++ = hexdigits[(ucs >> 16) & 0xf];
4292 *p++ = hexdigits[(ucs >> 12) & 0xf];
4293 *p++ = hexdigits[(ucs >> 8) & 0xf];
4294 *p++ = hexdigits[(ucs >> 4) & 0xf];
4295 *p++ = hexdigits[ucs & 0xf];
4296 continue;
4297 }
4298 /* Fall through: isolated surrogates are copied as-is */
4299 s--;
4300 size++;
4301 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004302#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004303 /* Map 16-bit characters to '\uxxxx' */
4304 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 *p++ = '\\';
4306 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004307 *p++ = hexdigits[(ch >> 12) & 0xf];
4308 *p++ = hexdigits[(ch >> 8) & 0xf];
4309 *p++ = hexdigits[(ch >> 4) & 0xf];
4310 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 /* Copy everything else as-is */
4313 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 *p++ = (char) ch;
4315 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004316 size = p - q;
4317
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004318 assert(size > 0);
4319 if (_PyBytes_Resize(&repr, size) < 0)
4320 return NULL;
4321 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322}
4323
4324PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4325{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004326 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004328 PyErr_BadArgument();
4329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004331 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4332 PyUnicode_GET_SIZE(unicode));
4333
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004334 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335}
4336
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004337/* --- Unicode Internal Codec ------------------------------------------- */
4338
4339PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 Py_ssize_t size,
4341 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004342{
4343 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004344 Py_ssize_t startinpos;
4345 Py_ssize_t endinpos;
4346 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004347 PyUnicodeObject *v;
4348 Py_UNICODE *p;
4349 const char *end;
4350 const char *reason;
4351 PyObject *errorHandler = NULL;
4352 PyObject *exc = NULL;
4353
Neal Norwitzd43069c2006-01-08 01:12:10 +00004354#ifdef Py_UNICODE_WIDE
4355 Py_UNICODE unimax = PyUnicode_GetMax();
4356#endif
4357
Thomas Wouters89f507f2006-12-13 04:49:30 +00004358 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004359 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4360 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004361 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004362 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004364 p = PyUnicode_AS_UNICODE(v);
4365 end = s + size;
4366
4367 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004368 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004369 /* We have to sanity check the raw data, otherwise doom looms for
4370 some malformed UCS-4 data. */
4371 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004372#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004373 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004374#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004375 end-s < Py_UNICODE_SIZE
4376 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004378 startinpos = s - starts;
4379 if (end-s < Py_UNICODE_SIZE) {
4380 endinpos = end-starts;
4381 reason = "truncated input";
4382 }
4383 else {
4384 endinpos = s - starts + Py_UNICODE_SIZE;
4385 reason = "illegal code point (> 0x10FFFF)";
4386 }
4387 outpos = p - PyUnicode_AS_UNICODE(v);
4388 if (unicode_decode_call_errorhandler(
4389 errors, &errorHandler,
4390 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004391 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004392 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004393 goto onError;
4394 }
4395 }
4396 else {
4397 p++;
4398 s += Py_UNICODE_SIZE;
4399 }
4400 }
4401
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004402 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004403 goto onError;
4404 Py_XDECREF(errorHandler);
4405 Py_XDECREF(exc);
4406 return (PyObject *)v;
4407
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004409 Py_XDECREF(v);
4410 Py_XDECREF(errorHandler);
4411 Py_XDECREF(exc);
4412 return NULL;
4413}
4414
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415/* --- Latin-1 Codec ------------------------------------------------------ */
4416
4417PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004418 Py_ssize_t size,
4419 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420{
4421 PyUnicodeObject *v;
4422 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004423 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004424
Guido van Rossumd57fd912000-03-10 22:53:23 +00004425 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004426 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 Py_UNICODE r = *(unsigned char*)s;
4428 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004429 }
4430
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 v = _PyUnicode_New(size);
4432 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004437 e = s + size;
4438 /* Unrolling the copy makes it much faster by reducing the looping
4439 overhead. This is similar to what many memcpy() implementations do. */
4440 unrolled_end = e - 4;
4441 while (s < unrolled_end) {
4442 p[0] = (unsigned char) s[0];
4443 p[1] = (unsigned char) s[1];
4444 p[2] = (unsigned char) s[2];
4445 p[3] = (unsigned char) s[3];
4446 s += 4;
4447 p += 4;
4448 }
4449 while (s < e)
4450 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004452
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 Py_XDECREF(v);
4455 return NULL;
4456}
4457
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004458/* create or adjust a UnicodeEncodeError */
4459static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 const char *encoding,
4461 const Py_UNICODE *unicode, Py_ssize_t size,
4462 Py_ssize_t startpos, Py_ssize_t endpos,
4463 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 *exceptionObject = PyUnicodeEncodeError_Create(
4467 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 }
4469 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004470 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4471 goto onError;
4472 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4473 goto onError;
4474 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4475 goto onError;
4476 return;
4477 onError:
4478 Py_DECREF(*exceptionObject);
4479 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 }
4481}
4482
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004483/* raises a UnicodeEncodeError */
4484static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 const char *encoding,
4486 const Py_UNICODE *unicode, Py_ssize_t size,
4487 Py_ssize_t startpos, Py_ssize_t endpos,
4488 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489{
4490 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004494}
4495
4496/* error handling callback helper:
4497 build arguments, call the callback and check the arguments,
4498 put the result into newpos and return the replacement string, which
4499 has to be freed by the caller */
4500static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 PyObject **errorHandler,
4502 const char *encoding, const char *reason,
4503 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4504 Py_ssize_t startpos, Py_ssize_t endpos,
4505 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004506{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004507 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004508
4509 PyObject *restuple;
4510 PyObject *resunicode;
4511
4512 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516 }
4517
4518 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522
4523 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004525 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004526 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004528 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 Py_DECREF(restuple);
4530 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004532 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 &resunicode, newpos)) {
4534 Py_DECREF(restuple);
4535 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004537 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4538 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4539 Py_DECREF(restuple);
4540 return NULL;
4541 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004544 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4546 Py_DECREF(restuple);
4547 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004548 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 Py_INCREF(resunicode);
4550 Py_DECREF(restuple);
4551 return resunicode;
4552}
4553
4554static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004555 Py_ssize_t size,
4556 const char *errors,
4557 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558{
4559 /* output object */
4560 PyObject *res;
4561 /* pointers to the beginning and end+1 of input */
4562 const Py_UNICODE *startp = p;
4563 const Py_UNICODE *endp = p + size;
4564 /* pointer to the beginning of the unencodable characters */
4565 /* const Py_UNICODE *badp = NULL; */
4566 /* pointer into the output */
4567 char *str;
4568 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004569 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004570 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4571 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 PyObject *errorHandler = NULL;
4573 PyObject *exc = NULL;
4574 /* the following variable is used for caching string comparisons
4575 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4576 int known_errorHandler = -1;
4577
4578 /* allocate enough for a simple encoding without
4579 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004580 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004581 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004582 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004584 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004585 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586 ressize = size;
4587
4588 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 /* can we encode this? */
4592 if (c<limit) {
4593 /* no overflow check, because we know that the space is enough */
4594 *str++ = (char)c;
4595 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004596 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 else {
4598 Py_ssize_t unicodepos = p-startp;
4599 Py_ssize_t requiredsize;
4600 PyObject *repunicode;
4601 Py_ssize_t repsize;
4602 Py_ssize_t newpos;
4603 Py_ssize_t respos;
4604 Py_UNICODE *uni2;
4605 /* startpos for collecting unencodable chars */
4606 const Py_UNICODE *collstart = p;
4607 const Py_UNICODE *collend = p;
4608 /* find all unecodable characters */
4609 while ((collend < endp) && ((*collend)>=limit))
4610 ++collend;
4611 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4612 if (known_errorHandler==-1) {
4613 if ((errors==NULL) || (!strcmp(errors, "strict")))
4614 known_errorHandler = 1;
4615 else if (!strcmp(errors, "replace"))
4616 known_errorHandler = 2;
4617 else if (!strcmp(errors, "ignore"))
4618 known_errorHandler = 3;
4619 else if (!strcmp(errors, "xmlcharrefreplace"))
4620 known_errorHandler = 4;
4621 else
4622 known_errorHandler = 0;
4623 }
4624 switch (known_errorHandler) {
4625 case 1: /* strict */
4626 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4627 goto onError;
4628 case 2: /* replace */
4629 while (collstart++<collend)
4630 *str++ = '?'; /* fall through */
4631 case 3: /* ignore */
4632 p = collend;
4633 break;
4634 case 4: /* xmlcharrefreplace */
4635 respos = str - PyBytes_AS_STRING(res);
4636 /* determine replacement size (temporarily (mis)uses p) */
4637 for (p = collstart, repsize = 0; p < collend; ++p) {
4638 if (*p<10)
4639 repsize += 2+1+1;
4640 else if (*p<100)
4641 repsize += 2+2+1;
4642 else if (*p<1000)
4643 repsize += 2+3+1;
4644 else if (*p<10000)
4645 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004646#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004647 else
4648 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004649#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 else if (*p<100000)
4651 repsize += 2+5+1;
4652 else if (*p<1000000)
4653 repsize += 2+6+1;
4654 else
4655 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004656#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004657 }
4658 requiredsize = respos+repsize+(endp-collend);
4659 if (requiredsize > ressize) {
4660 if (requiredsize<2*ressize)
4661 requiredsize = 2*ressize;
4662 if (_PyBytes_Resize(&res, requiredsize))
4663 goto onError;
4664 str = PyBytes_AS_STRING(res) + respos;
4665 ressize = requiredsize;
4666 }
4667 /* generate replacement (temporarily (mis)uses p) */
4668 for (p = collstart; p < collend; ++p) {
4669 str += sprintf(str, "&#%d;", (int)*p);
4670 }
4671 p = collend;
4672 break;
4673 default:
4674 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4675 encoding, reason, startp, size, &exc,
4676 collstart-startp, collend-startp, &newpos);
4677 if (repunicode == NULL)
4678 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004679 if (PyBytes_Check(repunicode)) {
4680 /* Directly copy bytes result to output. */
4681 repsize = PyBytes_Size(repunicode);
4682 if (repsize > 1) {
4683 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004684 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004685 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4686 Py_DECREF(repunicode);
4687 goto onError;
4688 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004689 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004690 ressize += repsize-1;
4691 }
4692 memcpy(str, PyBytes_AsString(repunicode), repsize);
4693 str += repsize;
4694 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004695 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004696 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004697 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004698 /* need more space? (at least enough for what we
4699 have+the replacement+the rest of the string, so
4700 we won't have to check space for encodable characters) */
4701 respos = str - PyBytes_AS_STRING(res);
4702 repsize = PyUnicode_GET_SIZE(repunicode);
4703 requiredsize = respos+repsize+(endp-collend);
4704 if (requiredsize > ressize) {
4705 if (requiredsize<2*ressize)
4706 requiredsize = 2*ressize;
4707 if (_PyBytes_Resize(&res, requiredsize)) {
4708 Py_DECREF(repunicode);
4709 goto onError;
4710 }
4711 str = PyBytes_AS_STRING(res) + respos;
4712 ressize = requiredsize;
4713 }
4714 /* check if there is anything unencodable in the replacement
4715 and copy it to the output */
4716 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4717 c = *uni2;
4718 if (c >= limit) {
4719 raise_encode_exception(&exc, encoding, startp, size,
4720 unicodepos, unicodepos+1, reason);
4721 Py_DECREF(repunicode);
4722 goto onError;
4723 }
4724 *str = (char)c;
4725 }
4726 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004727 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004728 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004729 }
4730 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004731 /* Resize if we allocated to much */
4732 size = str - PyBytes_AS_STRING(res);
4733 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004734 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004735 if (_PyBytes_Resize(&res, size) < 0)
4736 goto onError;
4737 }
4738
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004739 Py_XDECREF(errorHandler);
4740 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004741 return res;
4742
4743 onError:
4744 Py_XDECREF(res);
4745 Py_XDECREF(errorHandler);
4746 Py_XDECREF(exc);
4747 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004748}
4749
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004751 Py_ssize_t size,
4752 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755}
4756
4757PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4758{
4759 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004760 PyErr_BadArgument();
4761 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 }
4763 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004764 PyUnicode_GET_SIZE(unicode),
4765 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766}
4767
4768/* --- 7-bit ASCII Codec -------------------------------------------------- */
4769
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 Py_ssize_t size,
4772 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004774 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 PyUnicodeObject *v;
4776 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004777 Py_ssize_t startinpos;
4778 Py_ssize_t endinpos;
4779 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 const char *e;
4781 PyObject *errorHandler = NULL;
4782 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004783
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004785 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 Py_UNICODE r = *(unsigned char*)s;
4787 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004788 }
Tim Petersced69f82003-09-16 20:30:58 +00004789
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 v = _PyUnicode_New(size);
4791 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004794 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004796 e = s + size;
4797 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 register unsigned char c = (unsigned char)*s;
4799 if (c < 128) {
4800 *p++ = c;
4801 ++s;
4802 }
4803 else {
4804 startinpos = s-starts;
4805 endinpos = startinpos + 1;
4806 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4807 if (unicode_decode_call_errorhandler(
4808 errors, &errorHandler,
4809 "ascii", "ordinal not in range(128)",
4810 &starts, &e, &startinpos, &endinpos, &exc, &s,
4811 &v, &outpos, &p))
4812 goto onError;
4813 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004815 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4817 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004818 Py_XDECREF(errorHandler);
4819 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004821
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 Py_XDECREF(errorHandler);
4825 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 return NULL;
4827}
4828
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004830 Py_ssize_t size,
4831 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834}
4835
4836PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4837{
4838 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 PyErr_BadArgument();
4840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 }
4842 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 PyUnicode_GET_SIZE(unicode),
4844 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845}
4846
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004847#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004848
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004849/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004850
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004851#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004852#define NEED_RETRY
4853#endif
4854
4855/* XXX This code is limited to "true" double-byte encodings, as
4856 a) it assumes an incomplete character consists of a single byte, and
4857 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004859
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts when it is
   not itself the second half of a preceding double-byte character; this is
   decided by looking at the character CharPrev() reports before it. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;               /* no preceding character */
    if (!IsDBCSLeadByte(*prev))
        return 1;               /* preceded by a single-byte character */
    return curr - prev == 2;    /* preceded by a complete two-byte character */
}
4870
4871/*
4872 * Decode MBCS string into unicode object. If 'final' is set, converts
4873 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4874 */
4875static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 const char *s, /* MBCS string */
4877 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004878 int final,
4879 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004880{
4881 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004882 Py_ssize_t n;
4883 DWORD usize;
4884 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004885
4886 assert(size >= 0);
4887
Victor Stinner554f3f02010-06-16 23:33:54 +00004888 /* check and handle 'errors' arg */
4889 if (errors==NULL || strcmp(errors, "strict")==0)
4890 flags = MB_ERR_INVALID_CHARS;
4891 else if (strcmp(errors, "ignore")==0)
4892 flags = 0;
4893 else {
4894 PyErr_Format(PyExc_ValueError,
4895 "mbcs encoding does not support errors='%s'",
4896 errors);
4897 return -1;
4898 }
4899
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004900 /* Skip trailing lead-byte unless 'final' is set */
4901 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004902 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004903
4904 /* First get the size of the result */
4905 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004906 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4907 if (usize==0)
4908 goto mbcs_decode_error;
4909 } else
4910 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004911
4912 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 /* Create unicode object */
4914 *v = _PyUnicode_New(usize);
4915 if (*v == NULL)
4916 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004917 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004918 }
4919 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 /* Extend unicode object */
4921 n = PyUnicode_GET_SIZE(*v);
4922 if (_PyUnicode_Resize(v, n + usize) < 0)
4923 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004924 }
4925
4926 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004927 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004929 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4930 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004932 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004933 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004934
4935mbcs_decode_error:
4936 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4937 we raise a UnicodeDecodeError - else it is a 'generic'
4938 windows error
4939 */
4940 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4941 /* Ideally, we should get reason from FormatMessage - this
4942 is the Windows 2000 English version of the message
4943 */
4944 PyObject *exc = NULL;
4945 const char *reason = "No mapping for the Unicode character exists "
4946 "in the target multi-byte code page.";
4947 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4948 if (exc != NULL) {
4949 PyCodec_StrictErrors(exc);
4950 Py_DECREF(exc);
4951 }
4952 } else {
4953 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4954 }
4955 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004956}
4957
4958PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004959 Py_ssize_t size,
4960 const char *errors,
4961 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004962{
4963 PyUnicodeObject *v = NULL;
4964 int done;
4965
4966 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004967 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004968
4969#ifdef NEED_RETRY
4970 retry:
4971 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004972 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004973 else
4974#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004975 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004976
4977 if (done < 0) {
4978 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004980 }
4981
4982 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004984
4985#ifdef NEED_RETRY
4986 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004987 s += done;
4988 size -= done;
4989 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004990 }
4991#endif
4992
4993 return (PyObject *)v;
4994}
4995
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004996PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 Py_ssize_t size,
4998 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004999{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5001}
5002
5003/*
5004 * Convert unicode into string object (MBCS).
5005 * Returns 0 if succeed, -1 otherwise.
5006 */
5007static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005009 int size, /* size of unicode */
5010 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005011{
Victor Stinner554f3f02010-06-16 23:33:54 +00005012 BOOL usedDefaultChar = FALSE;
5013 BOOL *pusedDefaultChar;
5014 int mbcssize;
5015 Py_ssize_t n;
5016 PyObject *exc = NULL;
5017 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018
5019 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005020
Victor Stinner554f3f02010-06-16 23:33:54 +00005021 /* check and handle 'errors' arg */
5022 if (errors==NULL || strcmp(errors, "strict")==0) {
5023 flags = WC_NO_BEST_FIT_CHARS;
5024 pusedDefaultChar = &usedDefaultChar;
5025 } else if (strcmp(errors, "replace")==0) {
5026 flags = 0;
5027 pusedDefaultChar = NULL;
5028 } else {
5029 PyErr_Format(PyExc_ValueError,
5030 "mbcs encoding does not support errors='%s'",
5031 errors);
5032 return -1;
5033 }
5034
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005035 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005036 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005037 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5038 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 if (mbcssize == 0) {
5040 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5041 return -1;
5042 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005043 /* If we used a default char, then we failed! */
5044 if (pusedDefaultChar && *pusedDefaultChar)
5045 goto mbcs_encode_error;
5046 } else {
5047 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005048 }
5049
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005050 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 /* Create string object */
5052 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5053 if (*repr == NULL)
5054 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005055 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005056 }
5057 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005058 /* Extend string object */
5059 n = PyBytes_Size(*repr);
5060 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5061 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005062 }
5063
5064 /* Do the conversion */
5065 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005067 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5068 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5070 return -1;
5071 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005072 if (pusedDefaultChar && *pusedDefaultChar)
5073 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005074 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005075 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005076
5077mbcs_encode_error:
5078 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5079 Py_XDECREF(exc);
5080 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005081}
5082
5083PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 Py_ssize_t size,
5085 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005086{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005087 PyObject *repr = NULL;
5088 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005089
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005090#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005092 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005093 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005094 else
5095#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005096 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005097
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005098 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005099 Py_XDECREF(repr);
5100 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005101 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005102
5103#ifdef NEED_RETRY
5104 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 p += INT_MAX;
5106 size -= INT_MAX;
5107 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005108 }
5109#endif
5110
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005111 return repr;
5112}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005113
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005114PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5115{
5116 if (!PyUnicode_Check(unicode)) {
5117 PyErr_BadArgument();
5118 return NULL;
5119 }
5120 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005121 PyUnicode_GET_SIZE(unicode),
5122 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005123}
5124
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005125#undef NEED_RETRY
5126
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005127#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005128
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129/* --- Character Mapping Codec -------------------------------------------- */
5130
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 Py_ssize_t size,
5133 PyObject *mapping,
5134 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005136 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005137 Py_ssize_t startinpos;
5138 Py_ssize_t endinpos;
5139 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005140 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 PyUnicodeObject *v;
5142 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005144 PyObject *errorHandler = NULL;
5145 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005146 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005147 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005148
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 /* Default to Latin-1 */
5150 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152
5153 v = _PyUnicode_New(size);
5154 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005159 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005160 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 mapstring = PyUnicode_AS_UNICODE(mapping);
5162 maplen = PyUnicode_GET_SIZE(mapping);
5163 while (s < e) {
5164 unsigned char ch = *s;
5165 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 if (ch < maplen)
5168 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 if (x == 0xfffe) {
5171 /* undefined mapping */
5172 outpos = p-PyUnicode_AS_UNICODE(v);
5173 startinpos = s-starts;
5174 endinpos = startinpos+1;
5175 if (unicode_decode_call_errorhandler(
5176 errors, &errorHandler,
5177 "charmap", "character maps to <undefined>",
5178 &starts, &e, &startinpos, &endinpos, &exc, &s,
5179 &v, &outpos, &p)) {
5180 goto onError;
5181 }
5182 continue;
5183 }
5184 *p++ = x;
5185 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005186 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005187 }
5188 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 while (s < e) {
5190 unsigned char ch = *s;
5191 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005192
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5194 w = PyLong_FromLong((long)ch);
5195 if (w == NULL)
5196 goto onError;
5197 x = PyObject_GetItem(mapping, w);
5198 Py_DECREF(w);
5199 if (x == NULL) {
5200 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5201 /* No mapping found means: mapping is undefined. */
5202 PyErr_Clear();
5203 x = Py_None;
5204 Py_INCREF(x);
5205 } else
5206 goto onError;
5207 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005208
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 /* Apply mapping */
5210 if (PyLong_Check(x)) {
5211 long value = PyLong_AS_LONG(x);
5212 if (value < 0 || value > 65535) {
5213 PyErr_SetString(PyExc_TypeError,
5214 "character mapping must be in range(65536)");
5215 Py_DECREF(x);
5216 goto onError;
5217 }
5218 *p++ = (Py_UNICODE)value;
5219 }
5220 else if (x == Py_None) {
5221 /* undefined mapping */
5222 outpos = p-PyUnicode_AS_UNICODE(v);
5223 startinpos = s-starts;
5224 endinpos = startinpos+1;
5225 if (unicode_decode_call_errorhandler(
5226 errors, &errorHandler,
5227 "charmap", "character maps to <undefined>",
5228 &starts, &e, &startinpos, &endinpos, &exc, &s,
5229 &v, &outpos, &p)) {
5230 Py_DECREF(x);
5231 goto onError;
5232 }
5233 Py_DECREF(x);
5234 continue;
5235 }
5236 else if (PyUnicode_Check(x)) {
5237 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005238
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 if (targetsize == 1)
5240 /* 1-1 mapping */
5241 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005242
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 else if (targetsize > 1) {
5244 /* 1-n mapping */
5245 if (targetsize > extrachars) {
5246 /* resize first */
5247 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5248 Py_ssize_t needed = (targetsize - extrachars) + \
5249 (targetsize << 2);
5250 extrachars += needed;
5251 /* XXX overflow detection missing */
5252 if (_PyUnicode_Resize(&v,
5253 PyUnicode_GET_SIZE(v) + needed) < 0) {
5254 Py_DECREF(x);
5255 goto onError;
5256 }
5257 p = PyUnicode_AS_UNICODE(v) + oldpos;
5258 }
5259 Py_UNICODE_COPY(p,
5260 PyUnicode_AS_UNICODE(x),
5261 targetsize);
5262 p += targetsize;
5263 extrachars -= targetsize;
5264 }
5265 /* 1-0 mapping: skip the character */
5266 }
5267 else {
5268 /* wrong return value */
5269 PyErr_SetString(PyExc_TypeError,
5270 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005271 Py_DECREF(x);
5272 goto onError;
5273 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 Py_DECREF(x);
5275 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277 }
5278 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5280 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005281 Py_XDECREF(errorHandler);
5282 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005284
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005286 Py_XDECREF(errorHandler);
5287 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 Py_XDECREF(v);
5289 return NULL;
5290}
5291
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005292/* Charmap encoding: the lookup table */
5293
5294struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 PyObject_HEAD
5296 unsigned char level1[32];
5297 int count2, count3;
5298 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005299};
5300
5301static PyObject*
5302encoding_map_size(PyObject *obj, PyObject* args)
5303{
5304 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005305 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005307}
5308
5309static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005310 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005311 PyDoc_STR("Return the size (in bytes) of this object") },
5312 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005313};
5314
5315static void
5316encoding_map_dealloc(PyObject* o)
5317{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005318 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005319}
5320
5321static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005322 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005323 "EncodingMap", /*tp_name*/
5324 sizeof(struct encoding_map), /*tp_basicsize*/
5325 0, /*tp_itemsize*/
5326 /* methods */
5327 encoding_map_dealloc, /*tp_dealloc*/
5328 0, /*tp_print*/
5329 0, /*tp_getattr*/
5330 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005331 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005332 0, /*tp_repr*/
5333 0, /*tp_as_number*/
5334 0, /*tp_as_sequence*/
5335 0, /*tp_as_mapping*/
5336 0, /*tp_hash*/
5337 0, /*tp_call*/
5338 0, /*tp_str*/
5339 0, /*tp_getattro*/
5340 0, /*tp_setattro*/
5341 0, /*tp_as_buffer*/
5342 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5343 0, /*tp_doc*/
5344 0, /*tp_traverse*/
5345 0, /*tp_clear*/
5346 0, /*tp_richcompare*/
5347 0, /*tp_weaklistoffset*/
5348 0, /*tp_iter*/
5349 0, /*tp_iternext*/
5350 encoding_map_methods, /*tp_methods*/
5351 0, /*tp_members*/
5352 0, /*tp_getset*/
5353 0, /*tp_base*/
5354 0, /*tp_dict*/
5355 0, /*tp_descr_get*/
5356 0, /*tp_descr_set*/
5357 0, /*tp_dictoffset*/
5358 0, /*tp_init*/
5359 0, /*tp_alloc*/
5360 0, /*tp_new*/
5361 0, /*tp_free*/
5362 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005363};
5364
5365PyObject*
5366PyUnicode_BuildEncodingMap(PyObject* string)
5367{
5368 Py_UNICODE *decode;
5369 PyObject *result;
5370 struct encoding_map *mresult;
5371 int i;
5372 int need_dict = 0;
5373 unsigned char level1[32];
5374 unsigned char level2[512];
5375 unsigned char *mlevel1, *mlevel2, *mlevel3;
5376 int count2 = 0, count3 = 0;
5377
5378 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5379 PyErr_BadArgument();
5380 return NULL;
5381 }
5382 decode = PyUnicode_AS_UNICODE(string);
5383 memset(level1, 0xFF, sizeof level1);
5384 memset(level2, 0xFF, sizeof level2);
5385
5386 /* If there isn't a one-to-one mapping of NULL to \0,
5387 or if there are non-BMP characters, we need to use
5388 a mapping dictionary. */
5389 if (decode[0] != 0)
5390 need_dict = 1;
5391 for (i = 1; i < 256; i++) {
5392 int l1, l2;
5393 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005394#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005395 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005396#endif
5397 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005398 need_dict = 1;
5399 break;
5400 }
5401 if (decode[i] == 0xFFFE)
5402 /* unmapped character */
5403 continue;
5404 l1 = decode[i] >> 11;
5405 l2 = decode[i] >> 7;
5406 if (level1[l1] == 0xFF)
5407 level1[l1] = count2++;
5408 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005409 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005410 }
5411
5412 if (count2 >= 0xFF || count3 >= 0xFF)
5413 need_dict = 1;
5414
5415 if (need_dict) {
5416 PyObject *result = PyDict_New();
5417 PyObject *key, *value;
5418 if (!result)
5419 return NULL;
5420 for (i = 0; i < 256; i++) {
5421 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005422 key = PyLong_FromLong(decode[i]);
5423 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005424 if (!key || !value)
5425 goto failed1;
5426 if (PyDict_SetItem(result, key, value) == -1)
5427 goto failed1;
5428 Py_DECREF(key);
5429 Py_DECREF(value);
5430 }
5431 return result;
5432 failed1:
5433 Py_XDECREF(key);
5434 Py_XDECREF(value);
5435 Py_DECREF(result);
5436 return NULL;
5437 }
5438
5439 /* Create a three-level trie */
5440 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5441 16*count2 + 128*count3 - 1);
5442 if (!result)
5443 return PyErr_NoMemory();
5444 PyObject_Init(result, &EncodingMapType);
5445 mresult = (struct encoding_map*)result;
5446 mresult->count2 = count2;
5447 mresult->count3 = count3;
5448 mlevel1 = mresult->level1;
5449 mlevel2 = mresult->level23;
5450 mlevel3 = mresult->level23 + 16*count2;
5451 memcpy(mlevel1, level1, 32);
5452 memset(mlevel2, 0xFF, 16*count2);
5453 memset(mlevel3, 0, 128*count3);
5454 count3 = 0;
5455 for (i = 1; i < 256; i++) {
5456 int o1, o2, o3, i2, i3;
5457 if (decode[i] == 0xFFFE)
5458 /* unmapped character */
5459 continue;
5460 o1 = decode[i]>>11;
5461 o2 = (decode[i]>>7) & 0xF;
5462 i2 = 16*mlevel1[o1] + o2;
5463 if (mlevel2[i2] == 0xFF)
5464 mlevel2[i2] = count3++;
5465 o3 = decode[i] & 0x7F;
5466 i3 = 128*mlevel2[i2] + o3;
5467 mlevel3[i3] = i;
5468 }
5469 return result;
5470}
5471
5472static int
5473encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5474{
5475 struct encoding_map *map = (struct encoding_map*)mapping;
5476 int l1 = c>>11;
5477 int l2 = (c>>7) & 0xF;
5478 int l3 = c & 0x7F;
5479 int i;
5480
5481#ifdef Py_UNICODE_WIDE
5482 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005483 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005484 }
5485#endif
5486 if (c == 0)
5487 return 0;
5488 /* level 1*/
5489 i = map->level1[l1];
5490 if (i == 0xFF) {
5491 return -1;
5492 }
5493 /* level 2*/
5494 i = map->level23[16*i+l2];
5495 if (i == 0xFF) {
5496 return -1;
5497 }
5498 /* level 3 */
5499 i = map->level23[16*map->count2 + 128*i + l3];
5500 if (i == 0) {
5501 return -1;
5502 }
5503 return i;
5504}
5505
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506/* Lookup the character ch in the mapping. If the character
5507 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005508 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005509static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510{
Christian Heimes217cfd12007-12-02 14:31:20 +00005511 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 PyObject *x;
5513
5514 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 x = PyObject_GetItem(mapping, w);
5517 Py_DECREF(w);
5518 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5520 /* No mapping found means: mapping is undefined. */
5521 PyErr_Clear();
5522 x = Py_None;
5523 Py_INCREF(x);
5524 return x;
5525 } else
5526 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005528 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005530 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 long value = PyLong_AS_LONG(x);
5532 if (value < 0 || value > 255) {
5533 PyErr_SetString(PyExc_TypeError,
5534 "character mapping must be in range(256)");
5535 Py_DECREF(x);
5536 return NULL;
5537 }
5538 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005540 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005541 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 /* wrong return value */
5544 PyErr_Format(PyExc_TypeError,
5545 "character mapping must return integer, bytes or None, not %.400s",
5546 x->ob_type->tp_name);
5547 Py_DECREF(x);
5548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 }
5550}
5551
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005552static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005553charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005554{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005555 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5556 /* exponentially overallocate to minimize reallocations */
5557 if (requiredsize < 2*outsize)
5558 requiredsize = 2*outsize;
5559 if (_PyBytes_Resize(outobj, requiredsize))
5560 return -1;
5561 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005562}
5563
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005568 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005569 space is available. Return a new reference to the object that
5570 was put in the output buffer, or Py_None, if the mapping was undefined
5571 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005572 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005573static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005574charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005576{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005577 PyObject *rep;
5578 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005579 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580
Christian Heimes90aa7642007-12-19 02:45:37 +00005581 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005582 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005584 if (res == -1)
5585 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005586 if (outsize<requiredsize)
5587 if (charmapencode_resize(outobj, outpos, requiredsize))
5588 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005589 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 outstart[(*outpos)++] = (char)res;
5591 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005592 }
5593
5594 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005595 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005597 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 Py_DECREF(rep);
5599 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005600 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 if (PyLong_Check(rep)) {
5602 Py_ssize_t requiredsize = *outpos+1;
5603 if (outsize<requiredsize)
5604 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5605 Py_DECREF(rep);
5606 return enc_EXCEPTION;
5607 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005608 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005609 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005610 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 else {
5612 const char *repchars = PyBytes_AS_STRING(rep);
5613 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5614 Py_ssize_t requiredsize = *outpos+repsize;
5615 if (outsize<requiredsize)
5616 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5617 Py_DECREF(rep);
5618 return enc_EXCEPTION;
5619 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005620 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 memcpy(outstart + *outpos, repchars, repsize);
5622 *outpos += repsize;
5623 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005625 Py_DECREF(rep);
5626 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627}
5628
5629/* handle an error in PyUnicode_EncodeCharmap
5630 Return 0 on success, -1 on error */
5631static
5632int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005633 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005635 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005636 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637{
5638 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005639 Py_ssize_t repsize;
5640 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 Py_UNICODE *uni2;
5642 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005643 Py_ssize_t collstartpos = *inpos;
5644 Py_ssize_t collendpos = *inpos+1;
5645 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 char *encoding = "charmap";
5647 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005648 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005649
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650 /* find all unencodable characters */
5651 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005652 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005653 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005654 int res = encoding_map_lookup(p[collendpos], mapping);
5655 if (res != -1)
5656 break;
5657 ++collendpos;
5658 continue;
5659 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005660
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 rep = charmapencode_lookup(p[collendpos], mapping);
5662 if (rep==NULL)
5663 return -1;
5664 else if (rep!=Py_None) {
5665 Py_DECREF(rep);
5666 break;
5667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005668 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005670 }
5671 /* cache callback name lookup
5672 * (if not done yet, i.e. it's the first error) */
5673 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 if ((errors==NULL) || (!strcmp(errors, "strict")))
5675 *known_errorHandler = 1;
5676 else if (!strcmp(errors, "replace"))
5677 *known_errorHandler = 2;
5678 else if (!strcmp(errors, "ignore"))
5679 *known_errorHandler = 3;
5680 else if (!strcmp(errors, "xmlcharrefreplace"))
5681 *known_errorHandler = 4;
5682 else
5683 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005684 }
5685 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005686 case 1: /* strict */
5687 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5688 return -1;
5689 case 2: /* replace */
5690 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 x = charmapencode_output('?', mapping, res, respos);
5692 if (x==enc_EXCEPTION) {
5693 return -1;
5694 }
5695 else if (x==enc_FAILED) {
5696 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5697 return -1;
5698 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005699 }
5700 /* fall through */
5701 case 3: /* ignore */
5702 *inpos = collendpos;
5703 break;
5704 case 4: /* xmlcharrefreplace */
5705 /* generate replacement (temporarily (mis)uses p) */
5706 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 char buffer[2+29+1+1];
5708 char *cp;
5709 sprintf(buffer, "&#%d;", (int)p[collpos]);
5710 for (cp = buffer; *cp; ++cp) {
5711 x = charmapencode_output(*cp, mapping, res, respos);
5712 if (x==enc_EXCEPTION)
5713 return -1;
5714 else if (x==enc_FAILED) {
5715 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5716 return -1;
5717 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005718 }
5719 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005720 *inpos = collendpos;
5721 break;
5722 default:
5723 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005724 encoding, reason, p, size, exceptionObject,
5725 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005726 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005728 if (PyBytes_Check(repunicode)) {
5729 /* Directly copy bytes result to output. */
5730 Py_ssize_t outsize = PyBytes_Size(*res);
5731 Py_ssize_t requiredsize;
5732 repsize = PyBytes_Size(repunicode);
5733 requiredsize = *respos + repsize;
5734 if (requiredsize > outsize)
5735 /* Make room for all additional bytes. */
5736 if (charmapencode_resize(res, respos, requiredsize)) {
5737 Py_DECREF(repunicode);
5738 return -1;
5739 }
5740 memcpy(PyBytes_AsString(*res) + *respos,
5741 PyBytes_AsString(repunicode), repsize);
5742 *respos += repsize;
5743 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005744 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005745 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005747 /* generate replacement */
5748 repsize = PyUnicode_GET_SIZE(repunicode);
5749 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 x = charmapencode_output(*uni2, mapping, res, respos);
5751 if (x==enc_EXCEPTION) {
5752 return -1;
5753 }
5754 else if (x==enc_FAILED) {
5755 Py_DECREF(repunicode);
5756 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5757 return -1;
5758 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005759 }
5760 *inpos = newpos;
5761 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005762 }
5763 return 0;
5764}
5765
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 Py_ssize_t size,
5768 PyObject *mapping,
5769 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 /* output object */
5772 PyObject *res = NULL;
5773 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005774 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005776 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 PyObject *errorHandler = NULL;
5778 PyObject *exc = NULL;
5779 /* the following variable is used for caching string comparisons
5780 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5781 * 3=ignore, 4=xmlcharrefreplace */
5782 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783
5784 /* Default to Latin-1 */
5785 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 /* allocate enough for a simple encoding without
5789 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005790 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 if (res == NULL)
5792 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005793 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 /* try to encode it */
5798 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5799 if (x==enc_EXCEPTION) /* error */
5800 goto onError;
5801 if (x==enc_FAILED) { /* unencodable character */
5802 if (charmap_encoding_error(p, size, &inpos, mapping,
5803 &exc,
5804 &known_errorHandler, &errorHandler, errors,
5805 &res, &respos)) {
5806 goto onError;
5807 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005808 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 else
5810 /* done with this character => adjust input position */
5811 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005814 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005815 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005816 if (_PyBytes_Resize(&res, respos) < 0)
5817 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005818
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005819 Py_XDECREF(exc);
5820 Py_XDECREF(errorHandler);
5821 return res;
5822
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 Py_XDECREF(res);
5825 Py_XDECREF(exc);
5826 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 return NULL;
5828}
5829
5830PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832{
5833 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 PyErr_BadArgument();
5835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 }
5837 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 PyUnicode_GET_SIZE(unicode),
5839 mapping,
5840 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841}
5842
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005843/* create or adjust a UnicodeTranslateError */
5844static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 const Py_UNICODE *unicode, Py_ssize_t size,
5846 Py_ssize_t startpos, Py_ssize_t endpos,
5847 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005850 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 }
5853 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5855 goto onError;
5856 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5857 goto onError;
5858 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5859 goto onError;
5860 return;
5861 onError:
5862 Py_DECREF(*exceptionObject);
5863 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 }
5865}
5866
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867/* raises a UnicodeTranslateError */
5868static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 const Py_UNICODE *unicode, Py_ssize_t size,
5870 Py_ssize_t startpos, Py_ssize_t endpos,
5871 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005872{
5873 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005876 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877}
5878
5879/* error handling callback helper:
5880 build arguments, call the callback and check the arguments,
5881 put the result into newpos and return the replacement string, which
5882 has to be freed by the caller */
5883static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 PyObject **errorHandler,
5885 const char *reason,
5886 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5887 Py_ssize_t startpos, Py_ssize_t endpos,
5888 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005890 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005891
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005892 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 PyObject *restuple;
5894 PyObject *resunicode;
5895
5896 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005900 }
5901
5902 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906
5907 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005912 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 Py_DECREF(restuple);
5914 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005915 }
5916 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 &resunicode, &i_newpos)) {
5918 Py_DECREF(restuple);
5919 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005920 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005921 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005923 else
5924 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005925 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5927 Py_DECREF(restuple);
5928 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005929 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930 Py_INCREF(resunicode);
5931 Py_DECREF(restuple);
5932 return resunicode;
5933}
5934
5935/* Lookup the character ch in the mapping and put the result in result,
5936 which must be decrefed by the caller.
5937 Return 0 on success, -1 on error */
5938static
5939int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5940{
Christian Heimes217cfd12007-12-02 14:31:20 +00005941 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005942 PyObject *x;
5943
5944 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005946 x = PyObject_GetItem(mapping, w);
5947 Py_DECREF(w);
5948 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5950 /* No mapping found means: use 1:1 mapping. */
5951 PyErr_Clear();
5952 *result = NULL;
5953 return 0;
5954 } else
5955 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005956 }
5957 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005958 *result = x;
5959 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005960 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005961 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 long value = PyLong_AS_LONG(x);
5963 long max = PyUnicode_GetMax();
5964 if (value < 0 || value > max) {
5965 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005966 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 Py_DECREF(x);
5968 return -1;
5969 }
5970 *result = x;
5971 return 0;
5972 }
5973 else if (PyUnicode_Check(x)) {
5974 *result = x;
5975 return 0;
5976 }
5977 else {
5978 /* wrong return value */
5979 PyErr_SetString(PyExc_TypeError,
5980 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005981 Py_DECREF(x);
5982 return -1;
5983 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005984}
5985/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 if not reallocate and adjust various state variables.
5987 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005988static
Walter Dörwald4894c302003-10-24 14:25:28 +00005989int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005991{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005992 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005993 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 /* remember old output position */
5995 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5996 /* exponentially overallocate to minimize reallocations */
5997 if (requiredsize < 2 * oldsize)
5998 requiredsize = 2 * oldsize;
5999 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6000 return -1;
6001 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002 }
6003 return 0;
6004}
6005/* lookup the character, put the result in the output string and adjust
6006 various state variables. Return a new reference to the object that
6007 was put in the output buffer in *result, or Py_None, if the mapping was
6008 undefined (in which case no character was written).
6009 The called must decref result.
6010 Return 0 on success, -1 on error. */
6011static
Walter Dörwald4894c302003-10-24 14:25:28 +00006012int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6014 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015{
Walter Dörwald4894c302003-10-24 14:25:28 +00006016 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 /* not found => default to 1:1 mapping */
6020 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006021 }
6022 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006024 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 /* no overflow check, because we know that the space is enough */
6026 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006027 }
6028 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6030 if (repsize==1) {
6031 /* no overflow check, because we know that the space is enough */
6032 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6033 }
6034 else if (repsize!=0) {
6035 /* more than one character */
6036 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6037 (insize - (curinp-startinp)) +
6038 repsize - 1;
6039 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6040 return -1;
6041 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6042 *outp += repsize;
6043 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 }
6045 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006047 return 0;
6048}
6049
6050PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 Py_ssize_t size,
6052 PyObject *mapping,
6053 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006055 /* output object */
6056 PyObject *res = NULL;
6057 /* pointers to the beginning and end+1 of input */
6058 const Py_UNICODE *startp = p;
6059 const Py_UNICODE *endp = p + size;
6060 /* pointer into the output */
6061 Py_UNICODE *str;
6062 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006063 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 char *reason = "character maps to <undefined>";
6065 PyObject *errorHandler = NULL;
6066 PyObject *exc = NULL;
6067 /* the following variable is used for caching string comparisons
6068 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6069 * 3=ignore, 4=xmlcharrefreplace */
6070 int known_errorHandler = -1;
6071
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 PyErr_BadArgument();
6074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006076
6077 /* allocate enough for a simple 1:1 translation without
6078 replacements, if we need more, we'll resize */
6079 res = PyUnicode_FromUnicode(NULL, size);
6080 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006084 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 /* try to encode it */
6088 PyObject *x = NULL;
6089 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6090 Py_XDECREF(x);
6091 goto onError;
6092 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006093 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 if (x!=Py_None) /* it worked => adjust input pointer */
6095 ++p;
6096 else { /* untranslatable character */
6097 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6098 Py_ssize_t repsize;
6099 Py_ssize_t newpos;
6100 Py_UNICODE *uni2;
6101 /* startpos for collecting untranslatable chars */
6102 const Py_UNICODE *collstart = p;
6103 const Py_UNICODE *collend = p+1;
6104 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 /* find all untranslatable characters */
6107 while (collend < endp) {
6108 if (charmaptranslate_lookup(*collend, mapping, &x))
6109 goto onError;
6110 Py_XDECREF(x);
6111 if (x!=Py_None)
6112 break;
6113 ++collend;
6114 }
6115 /* cache callback name lookup
6116 * (if not done yet, i.e. it's the first error) */
6117 if (known_errorHandler==-1) {
6118 if ((errors==NULL) || (!strcmp(errors, "strict")))
6119 known_errorHandler = 1;
6120 else if (!strcmp(errors, "replace"))
6121 known_errorHandler = 2;
6122 else if (!strcmp(errors, "ignore"))
6123 known_errorHandler = 3;
6124 else if (!strcmp(errors, "xmlcharrefreplace"))
6125 known_errorHandler = 4;
6126 else
6127 known_errorHandler = 0;
6128 }
6129 switch (known_errorHandler) {
6130 case 1: /* strict */
6131 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006132 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 case 2: /* replace */
6134 /* No need to check for space, this is a 1:1 replacement */
6135 for (coll = collstart; coll<collend; ++coll)
6136 *str++ = '?';
6137 /* fall through */
6138 case 3: /* ignore */
6139 p = collend;
6140 break;
6141 case 4: /* xmlcharrefreplace */
6142 /* generate replacement (temporarily (mis)uses p) */
6143 for (p = collstart; p < collend; ++p) {
6144 char buffer[2+29+1+1];
6145 char *cp;
6146 sprintf(buffer, "&#%d;", (int)*p);
6147 if (charmaptranslate_makespace(&res, &str,
6148 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6149 goto onError;
6150 for (cp = buffer; *cp; ++cp)
6151 *str++ = *cp;
6152 }
6153 p = collend;
6154 break;
6155 default:
6156 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6157 reason, startp, size, &exc,
6158 collstart-startp, collend-startp, &newpos);
6159 if (repunicode == NULL)
6160 goto onError;
6161 /* generate replacement */
6162 repsize = PyUnicode_GET_SIZE(repunicode);
6163 if (charmaptranslate_makespace(&res, &str,
6164 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6165 Py_DECREF(repunicode);
6166 goto onError;
6167 }
6168 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6169 *str++ = *uni2;
6170 p = startp + newpos;
6171 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006173 }
6174 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006175 /* Resize if we allocated to much */
6176 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006177 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 if (PyUnicode_Resize(&res, respos) < 0)
6179 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006180 }
6181 Py_XDECREF(exc);
6182 Py_XDECREF(errorHandler);
6183 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006186 Py_XDECREF(res);
6187 Py_XDECREF(exc);
6188 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189 return NULL;
6190}
6191
6192PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 PyObject *mapping,
6194 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195{
6196 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006197
Guido van Rossumd57fd912000-03-10 22:53:23 +00006198 str = PyUnicode_FromObject(str);
6199 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 PyUnicode_GET_SIZE(str),
6203 mapping,
6204 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 Py_DECREF(str);
6206 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006207
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 Py_XDECREF(str);
6210 return NULL;
6211}
Tim Petersced69f82003-09-16 20:30:58 +00006212
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006213PyObject *
6214PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6215 Py_ssize_t length)
6216{
6217 PyObject *result;
6218 Py_UNICODE *p; /* write pointer into result */
6219 Py_ssize_t i;
6220 /* Copy to a new string */
6221 result = (PyObject *)_PyUnicode_New(length);
6222 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6223 if (result == NULL)
6224 return result;
6225 p = PyUnicode_AS_UNICODE(result);
6226 /* Iterate over code points */
6227 for (i = 0; i < length; i++) {
6228 Py_UNICODE ch =s[i];
6229 if (ch > 127) {
6230 int decimal = Py_UNICODE_TODECIMAL(ch);
6231 if (decimal >= 0)
6232 p[i] = '0' + decimal;
6233 }
6234 }
6235 return result;
6236}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006237/* --- Decimal Encoder ---------------------------------------------------- */
6238
6239int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006240 Py_ssize_t length,
6241 char *output,
6242 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006243{
6244 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006245 PyObject *errorHandler = NULL;
6246 PyObject *exc = NULL;
6247 const char *encoding = "decimal";
6248 const char *reason = "invalid decimal Unicode string";
6249 /* the following variable is used for caching string comparisons
6250 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6251 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006252
6253 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 PyErr_BadArgument();
6255 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006256 }
6257
6258 p = s;
6259 end = s + length;
6260 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 register Py_UNICODE ch = *p;
6262 int decimal;
6263 PyObject *repunicode;
6264 Py_ssize_t repsize;
6265 Py_ssize_t newpos;
6266 Py_UNICODE *uni2;
6267 Py_UNICODE *collstart;
6268 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006269
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006271 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 ++p;
6273 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006274 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 decimal = Py_UNICODE_TODECIMAL(ch);
6276 if (decimal >= 0) {
6277 *output++ = '0' + decimal;
6278 ++p;
6279 continue;
6280 }
6281 if (0 < ch && ch < 256) {
6282 *output++ = (char)ch;
6283 ++p;
6284 continue;
6285 }
6286 /* All other characters are considered unencodable */
6287 collstart = p;
6288 collend = p+1;
6289 while (collend < end) {
6290 if ((0 < *collend && *collend < 256) ||
6291 !Py_UNICODE_ISSPACE(*collend) ||
6292 Py_UNICODE_TODECIMAL(*collend))
6293 break;
6294 }
6295 /* cache callback name lookup
6296 * (if not done yet, i.e. it's the first error) */
6297 if (known_errorHandler==-1) {
6298 if ((errors==NULL) || (!strcmp(errors, "strict")))
6299 known_errorHandler = 1;
6300 else if (!strcmp(errors, "replace"))
6301 known_errorHandler = 2;
6302 else if (!strcmp(errors, "ignore"))
6303 known_errorHandler = 3;
6304 else if (!strcmp(errors, "xmlcharrefreplace"))
6305 known_errorHandler = 4;
6306 else
6307 known_errorHandler = 0;
6308 }
6309 switch (known_errorHandler) {
6310 case 1: /* strict */
6311 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6312 goto onError;
6313 case 2: /* replace */
6314 for (p = collstart; p < collend; ++p)
6315 *output++ = '?';
6316 /* fall through */
6317 case 3: /* ignore */
6318 p = collend;
6319 break;
6320 case 4: /* xmlcharrefreplace */
6321 /* generate replacement (temporarily (mis)uses p) */
6322 for (p = collstart; p < collend; ++p)
6323 output += sprintf(output, "&#%d;", (int)*p);
6324 p = collend;
6325 break;
6326 default:
6327 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6328 encoding, reason, s, length, &exc,
6329 collstart-s, collend-s, &newpos);
6330 if (repunicode == NULL)
6331 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006332 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006333 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006334 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6335 Py_DECREF(repunicode);
6336 goto onError;
6337 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 /* generate replacement */
6339 repsize = PyUnicode_GET_SIZE(repunicode);
6340 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6341 Py_UNICODE ch = *uni2;
6342 if (Py_UNICODE_ISSPACE(ch))
6343 *output++ = ' ';
6344 else {
6345 decimal = Py_UNICODE_TODECIMAL(ch);
6346 if (decimal >= 0)
6347 *output++ = '0' + decimal;
6348 else if (0 < ch && ch < 256)
6349 *output++ = (char)ch;
6350 else {
6351 Py_DECREF(repunicode);
6352 raise_encode_exception(&exc, encoding,
6353 s, length, collstart-s, collend-s, reason);
6354 goto onError;
6355 }
6356 }
6357 }
6358 p = s + newpos;
6359 Py_DECREF(repunicode);
6360 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006361 }
6362 /* 0-terminate the output string */
6363 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364 Py_XDECREF(exc);
6365 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006366 return 0;
6367
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369 Py_XDECREF(exc);
6370 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006371 return -1;
6372}
6373
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374/* --- Helpers ------------------------------------------------------------ */
6375
Eric Smith8c663262007-08-25 02:26:07 +00006376#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006377#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006378
Thomas Wouters477c8d52006-05-27 19:21:47 +00006379#include "stringlib/count.h"
6380#include "stringlib/find.h"
6381#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006382#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006383
Eric Smith5807c412008-05-11 21:00:57 +00006384#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006385#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006386#include "stringlib/localeutil.h"
6387
/* Helper macro to fix up start/end slice values against a length len.

   end is clamped to len when it is larger; a negative end has len added
   and is then clamped to 0.  A negative start has len added and is then
   clamped to 0; a start larger than len is left as it is.

   start and end are assigned to, so they must be modifiable lvalues,
   and every argument is evaluated more than once.  The expansion is a
   pair of bare if statements (not a do { } while (0) block), so the
   macro must be used as a complete statement and never as the unbraced
   body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006402
Martin v. Löwis18e16552006-02-15 17:27:45 +00006403Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006404 PyObject *substr,
6405 Py_ssize_t start,
6406 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006407{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006408 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006409 PyUnicodeObject* str_obj;
6410 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006411
Thomas Wouters477c8d52006-05-27 19:21:47 +00006412 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6413 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006415 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6416 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 Py_DECREF(str_obj);
6418 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 }
Tim Petersced69f82003-09-16 20:30:58 +00006420
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006421 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006422 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006423 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6424 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006425 );
6426
6427 Py_DECREF(sub_obj);
6428 Py_DECREF(str_obj);
6429
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430 return result;
6431}
6432
Martin v. Löwis18e16552006-02-15 17:27:45 +00006433Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006434 PyObject *sub,
6435 Py_ssize_t start,
6436 Py_ssize_t end,
6437 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006439 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006440
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006442 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006444 sub = PyUnicode_FromObject(sub);
6445 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 Py_DECREF(str);
6447 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 }
Tim Petersced69f82003-09-16 20:30:58 +00006449
Thomas Wouters477c8d52006-05-27 19:21:47 +00006450 if (direction > 0)
6451 result = stringlib_find_slice(
6452 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6453 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6454 start, end
6455 );
6456 else
6457 result = stringlib_rfind_slice(
6458 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6459 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6460 start, end
6461 );
6462
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006464 Py_DECREF(sub);
6465
Guido van Rossumd57fd912000-03-10 22:53:23 +00006466 return result;
6467}
6468
Tim Petersced69f82003-09-16 20:30:58 +00006469static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 PyUnicodeObject *substring,
6472 Py_ssize_t start,
6473 Py_ssize_t end,
6474 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 if (substring->length == 0)
6477 return 1;
6478
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006479 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 end -= substring->length;
6481 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483
6484 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 if (Py_UNICODE_MATCH(self, end, substring))
6486 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 } else {
6488 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 }
6491
6492 return 0;
6493}
6494
Martin v. Löwis18e16552006-02-15 17:27:45 +00006495Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 PyObject *substr,
6497 Py_ssize_t start,
6498 Py_ssize_t end,
6499 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006501 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006502
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 str = PyUnicode_FromObject(str);
6504 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 substr = PyUnicode_FromObject(substr);
6507 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 Py_DECREF(str);
6509 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 }
Tim Petersced69f82003-09-16 20:30:58 +00006511
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 (PyUnicodeObject *)substr,
6514 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 Py_DECREF(str);
6516 Py_DECREF(substr);
6517 return result;
6518}
6519
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520/* Apply fixfct filter to the Unicode object self and return a
6521 reference to the modified object */
6522
Tim Petersced69f82003-09-16 20:30:58 +00006523static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006524PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526{
6527
6528 PyUnicodeObject *u;
6529
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006530 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006533
6534 Py_UNICODE_COPY(u->str, self->str, self->length);
6535
Tim Peters7a29bd52001-09-12 03:03:31 +00006536 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 /* fixfct should return TRUE if it modified the buffer. If
6538 FALSE, return a reference to the original buffer instead
6539 (to save space, not time) */
6540 Py_INCREF(self);
6541 Py_DECREF(u);
6542 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 }
6544 return (PyObject*) u;
6545}
6546
Tim Petersced69f82003-09-16 20:30:58 +00006547static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548int fixupper(PyUnicodeObject *self)
6549{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006550 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 Py_UNICODE *s = self->str;
6552 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006553
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006556
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 ch = Py_UNICODE_TOUPPER(*s);
6558 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 *s = ch;
6561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 s++;
6563 }
6564
6565 return status;
6566}
6567
Tim Petersced69f82003-09-16 20:30:58 +00006568static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569int fixlower(PyUnicodeObject *self)
6570{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006571 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 Py_UNICODE *s = self->str;
6573 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006574
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006577
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 ch = Py_UNICODE_TOLOWER(*s);
6579 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 *s = ch;
6582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 s++;
6584 }
6585
6586 return status;
6587}
6588
Tim Petersced69f82003-09-16 20:30:58 +00006589static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590int fixswapcase(PyUnicodeObject *self)
6591{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006592 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 Py_UNICODE *s = self->str;
6594 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006595
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 while (len-- > 0) {
6597 if (Py_UNICODE_ISUPPER(*s)) {
6598 *s = Py_UNICODE_TOLOWER(*s);
6599 status = 1;
6600 } else if (Py_UNICODE_ISLOWER(*s)) {
6601 *s = Py_UNICODE_TOUPPER(*s);
6602 status = 1;
6603 }
6604 s++;
6605 }
6606
6607 return status;
6608}
6609
Tim Petersced69f82003-09-16 20:30:58 +00006610static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611int fixcapitalize(PyUnicodeObject *self)
6612{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006613 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006614 Py_UNICODE *s = self->str;
6615 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006616
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006617 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006619 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 *s = Py_UNICODE_TOUPPER(*s);
6621 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006623 s++;
6624 while (--len > 0) {
6625 if (Py_UNICODE_ISUPPER(*s)) {
6626 *s = Py_UNICODE_TOLOWER(*s);
6627 status = 1;
6628 }
6629 s++;
6630 }
6631 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632}
6633
6634static
6635int fixtitle(PyUnicodeObject *self)
6636{
6637 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6638 register Py_UNICODE *e;
6639 int previous_is_cased;
6640
6641 /* Shortcut for single character strings */
6642 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6644 if (*p != ch) {
6645 *p = ch;
6646 return 1;
6647 }
6648 else
6649 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650 }
Tim Petersced69f82003-09-16 20:30:58 +00006651
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 e = p + PyUnicode_GET_SIZE(self);
6653 previous_is_cased = 0;
6654 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006656
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 if (previous_is_cased)
6658 *p = Py_UNICODE_TOLOWER(ch);
6659 else
6660 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006661
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 if (Py_UNICODE_ISLOWER(ch) ||
6663 Py_UNICODE_ISUPPER(ch) ||
6664 Py_UNICODE_ISTITLE(ch))
6665 previous_is_cased = 1;
6666 else
6667 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 }
6669 return 1;
6670}
6671
Tim Peters8ce9f162004-08-27 01:49:32 +00006672PyObject *
6673PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674{
Skip Montanaro6543b452004-09-16 03:28:13 +00006675 const Py_UNICODE blank = ' ';
6676 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006677 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006678 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006679 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6680 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006681 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6682 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006683 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006684 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685
Tim Peters05eba1f2004-08-27 21:32:02 +00006686 fseq = PySequence_Fast(seq, "");
6687 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006688 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006689 }
6690
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006691 /* NOTE: the following code can't call back into Python code,
6692 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006693 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006694
Tim Peters05eba1f2004-08-27 21:32:02 +00006695 seqlen = PySequence_Fast_GET_SIZE(fseq);
6696 /* If empty sequence, return u"". */
6697 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006698 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6699 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006700 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006701 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006702 /* If singleton sequence with an exact Unicode, return that. */
6703 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 item = items[0];
6705 if (PyUnicode_CheckExact(item)) {
6706 Py_INCREF(item);
6707 res = (PyUnicodeObject *)item;
6708 goto Done;
6709 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006710 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006711 else {
6712 /* Set up sep and seplen */
6713 if (separator == NULL) {
6714 sep = &blank;
6715 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006716 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006717 else {
6718 if (!PyUnicode_Check(separator)) {
6719 PyErr_Format(PyExc_TypeError,
6720 "separator: expected str instance,"
6721 " %.80s found",
6722 Py_TYPE(separator)->tp_name);
6723 goto onError;
6724 }
6725 sep = PyUnicode_AS_UNICODE(separator);
6726 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006727 }
6728 }
6729
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006730 /* There are at least two things to join, or else we have a subclass
6731 * of str in the sequence.
6732 * Do a pre-pass to figure out the total amount of space we'll
6733 * need (sz), and see whether all argument are strings.
6734 */
6735 sz = 0;
6736 for (i = 0; i < seqlen; i++) {
6737 const Py_ssize_t old_sz = sz;
6738 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 if (!PyUnicode_Check(item)) {
6740 PyErr_Format(PyExc_TypeError,
6741 "sequence item %zd: expected str instance,"
6742 " %.80s found",
6743 i, Py_TYPE(item)->tp_name);
6744 goto onError;
6745 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006746 sz += PyUnicode_GET_SIZE(item);
6747 if (i != 0)
6748 sz += seplen;
6749 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6750 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006752 goto onError;
6753 }
6754 }
Tim Petersced69f82003-09-16 20:30:58 +00006755
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006756 res = _PyUnicode_New(sz);
6757 if (res == NULL)
6758 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006759
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006760 /* Catenate everything. */
6761 res_p = PyUnicode_AS_UNICODE(res);
6762 for (i = 0; i < seqlen; ++i) {
6763 Py_ssize_t itemlen;
6764 item = items[i];
6765 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 /* Copy item, and maybe the separator. */
6767 if (i) {
6768 Py_UNICODE_COPY(res_p, sep, seplen);
6769 res_p += seplen;
6770 }
6771 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6772 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006773 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006774
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006776 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 return (PyObject *)res;
6778
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006780 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006781 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 return NULL;
6783}
6784
Tim Petersced69f82003-09-16 20:30:58 +00006785static
6786PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006787 Py_ssize_t left,
6788 Py_ssize_t right,
6789 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790{
6791 PyUnicodeObject *u;
6792
6793 if (left < 0)
6794 left = 0;
6795 if (right < 0)
6796 right = 0;
6797
Tim Peters7a29bd52001-09-12 03:03:31 +00006798 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799 Py_INCREF(self);
6800 return self;
6801 }
6802
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006803 if (left > PY_SSIZE_T_MAX - self->length ||
6804 right > PY_SSIZE_T_MAX - (left + self->length)) {
6805 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6806 return NULL;
6807 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808 u = _PyUnicode_New(left + self->length + right);
6809 if (u) {
6810 if (left)
6811 Py_UNICODE_FILL(u->str, fill, left);
6812 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6813 if (right)
6814 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6815 }
6816
6817 return u;
6818}
6819
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006820PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006822 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823
6824 string = PyUnicode_FromObject(string);
6825 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006828 list = stringlib_splitlines(
6829 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6830 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831
6832 Py_DECREF(string);
6833 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834}
6835
Tim Petersced69f82003-09-16 20:30:58 +00006836static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 PyUnicodeObject *substring,
6839 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006842 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006845 return stringlib_split_whitespace(
6846 (PyObject*) self, self->str, self->length, maxcount
6847 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006849 return stringlib_split(
6850 (PyObject*) self, self->str, self->length,
6851 substring->str, substring->length,
6852 maxcount
6853 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854}
6855
Tim Petersced69f82003-09-16 20:30:58 +00006856static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006857PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 PyUnicodeObject *substring,
6859 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006860{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006861 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006862 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006863
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006864 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006865 return stringlib_rsplit_whitespace(
6866 (PyObject*) self, self->str, self->length, maxcount
6867 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006868
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006869 return stringlib_rsplit(
6870 (PyObject*) self, self->str, self->length,
6871 substring->str, substring->length,
6872 maxcount
6873 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006874}
6875
6876static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 PyUnicodeObject *str1,
6879 PyUnicodeObject *str2,
6880 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881{
6882 PyUnicodeObject *u;
6883
6884 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006886 else if (maxcount == 0 || self->length == 0)
6887 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888
Thomas Wouters477c8d52006-05-27 19:21:47 +00006889 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006890 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006891 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006892 if (str1->length == 0)
6893 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006894 if (str1->length == 1) {
6895 /* replace characters */
6896 Py_UNICODE u1, u2;
6897 if (!findchar(self->str, self->length, str1->str[0]))
6898 goto nothing;
6899 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6900 if (!u)
6901 return NULL;
6902 Py_UNICODE_COPY(u->str, self->str, self->length);
6903 u1 = str1->str[0];
6904 u2 = str2->str[0];
6905 for (i = 0; i < u->length; i++)
6906 if (u->str[i] == u1) {
6907 if (--maxcount < 0)
6908 break;
6909 u->str[i] = u2;
6910 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006911 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006912 i = stringlib_find(
6913 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006915 if (i < 0)
6916 goto nothing;
6917 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6918 if (!u)
6919 return NULL;
6920 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006921
6922 /* change everything in-place, starting with this one */
6923 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6924 i += str1->length;
6925
6926 while ( --maxcount > 0) {
6927 i = stringlib_find(self->str+i, self->length-i,
6928 str1->str, str1->length,
6929 i);
6930 if (i == -1)
6931 break;
6932 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6933 i += str1->length;
6934 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006937
6938 Py_ssize_t n, i, j, e;
6939 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 Py_UNICODE *p;
6941
6942 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006943 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6944 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006945 if (n == 0)
6946 goto nothing;
6947 /* new_size = self->length + n * (str2->length - str1->length)); */
6948 delta = (str2->length - str1->length);
6949 if (delta == 0) {
6950 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006952 product = n * (str2->length - str1->length);
6953 if ((product / (str2->length - str1->length)) != n) {
6954 PyErr_SetString(PyExc_OverflowError,
6955 "replace string is too long");
6956 return NULL;
6957 }
6958 new_size = self->length + product;
6959 if (new_size < 0) {
6960 PyErr_SetString(PyExc_OverflowError,
6961 "replace string is too long");
6962 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 }
6964 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006965 u = _PyUnicode_New(new_size);
6966 if (!u)
6967 return NULL;
6968 i = 0;
6969 p = u->str;
6970 e = self->length - str1->length;
6971 if (str1->length > 0) {
6972 while (n-- > 0) {
6973 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006974 j = stringlib_find(self->str+i, self->length-i,
6975 str1->str, str1->length,
6976 i);
6977 if (j == -1)
6978 break;
6979 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006980 /* copy unchanged part [i:j] */
6981 Py_UNICODE_COPY(p, self->str+i, j-i);
6982 p += j - i;
6983 }
6984 /* copy substitution string */
6985 if (str2->length > 0) {
6986 Py_UNICODE_COPY(p, str2->str, str2->length);
6987 p += str2->length;
6988 }
6989 i = j + str1->length;
6990 }
6991 if (i < self->length)
6992 /* copy tail [i:] */
6993 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6994 } else {
6995 /* interleave */
6996 while (n > 0) {
6997 Py_UNICODE_COPY(p, str2->str, str2->length);
6998 p += str2->length;
6999 if (--n <= 0)
7000 break;
7001 *p++ = self->str[i++];
7002 }
7003 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007007
Benjamin Peterson29060642009-01-31 22:14:21 +00007008 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009 /* nothing to replace; return original string (when possible) */
7010 if (PyUnicode_CheckExact(self)) {
7011 Py_INCREF(self);
7012 return (PyObject *) self;
7013 }
7014 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007015}
7016
7017/* --- Unicode Object Methods --------------------------------------------- */
7018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007019PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021\n\
7022Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007023characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024
7025static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007026unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 return fixup(self, fixtitle);
7029}
7030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007031PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033\n\
7034Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007035have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007036
7037static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007038unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040 return fixup(self, fixcapitalize);
7041}
7042
#if 0
/* Disabled: kept for reference only, never compiled. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split on whitespace, without a limit. */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Replace every word in the list by its capitalized form. */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                                fixcapitalize);
        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join with the default separator (a single blank). */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return joined;
}
#endif
7080
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007081/* Argument converter. Coerces to a single unicode character */
7082
7083static int
7084convert_uc(PyObject *obj, void *addr)
7085{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007086 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7087 PyObject *uniobj;
7088 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007089
Benjamin Peterson14339b62009-01-31 16:36:08 +00007090 uniobj = PyUnicode_FromObject(obj);
7091 if (uniobj == NULL) {
7092 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007094 return 0;
7095 }
7096 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7097 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007099 Py_DECREF(uniobj);
7100 return 0;
7101 }
7102 unistr = PyUnicode_AS_UNICODE(uniobj);
7103 *fillcharloc = unistr[0];
7104 Py_DECREF(uniobj);
7105 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007106}
7107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007108PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007109 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007111Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007112done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113
7114static PyObject *
7115unicode_center(PyUnicodeObject *self, PyObject *args)
7116{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007117 Py_ssize_t marg, left;
7118 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007119 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120
Thomas Woutersde017742006-02-16 19:34:37 +00007121 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 return NULL;
7123
Tim Peters7a29bd52001-09-12 03:03:31 +00007124 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 Py_INCREF(self);
7126 return (PyObject*) self;
7127 }
7128
7129 marg = width - self->length;
7130 left = marg / 2 + (marg & width & 1);
7131
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007132 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133}
7134
Marc-André Lemburge5034372000-08-08 08:04:29 +00007135#if 0
7136
7137/* This code should go into some future Unicode collation support
7138 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007139 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007140
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007141/* speedy UTF-16 code point order comparison */
7142/* gleaned from: */
7143/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7144
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007145static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007146{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007147 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007148 0, 0, 0, 0, 0, 0, 0, 0,
7149 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007150 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007151};
7152
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153static int
7154unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7155{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007156 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007157
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 Py_UNICODE *s1 = str1->str;
7159 Py_UNICODE *s2 = str2->str;
7160
7161 len1 = str1->length;
7162 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007163
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007165 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007166
7167 c1 = *s1++;
7168 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007169
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 if (c1 > (1<<11) * 26)
7171 c1 += utf16Fixup[c1>>11];
7172 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007173 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007174 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007175
7176 if (c1 != c2)
7177 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007178
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007179 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180 }
7181
7182 return (len1 < len2) ? -1 : (len1 != len2);
7183}
7184
Marc-André Lemburge5034372000-08-08 08:04:29 +00007185#else
7186
7187static int
7188unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7189{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007190 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007191
7192 Py_UNICODE *s1 = str1->str;
7193 Py_UNICODE *s2 = str2->str;
7194
7195 len1 = str1->length;
7196 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007197
Marc-André Lemburge5034372000-08-08 08:04:29 +00007198 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007199 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007200
Fredrik Lundh45714e92001-06-26 16:39:36 +00007201 c1 = *s1++;
7202 c2 = *s2++;
7203
7204 if (c1 != c2)
7205 return (c1 < c2) ? -1 : 1;
7206
Marc-André Lemburge5034372000-08-08 08:04:29 +00007207 len1--; len2--;
7208 }
7209
7210 return (len1 < len2) ? -1 : (len1 != len2);
7211}
7212
7213#endif
7214
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007218 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7219 return unicode_compare((PyUnicodeObject *)left,
7220 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007221 PyErr_Format(PyExc_TypeError,
7222 "Can't compare %.100s and %.100s",
7223 left->ob_type->tp_name,
7224 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007225 return -1;
7226}
7227
Martin v. Löwis5b222132007-06-10 09:51:05 +00007228int
7229PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7230{
7231 int i;
7232 Py_UNICODE *id;
7233 assert(PyUnicode_Check(uni));
7234 id = PyUnicode_AS_UNICODE(uni);
7235 /* Compare Unicode string and source character set string */
7236 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007237 if (id[i] != str[i])
7238 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007239 /* This check keeps Python strings that end in '\0' from comparing equal
7240 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007241 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007242 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007243 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007244 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007245 return 0;
7246}
7247
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007248
Benjamin Peterson29060642009-01-31 22:14:21 +00007249#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007250 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007251
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007252PyObject *PyUnicode_RichCompare(PyObject *left,
7253 PyObject *right,
7254 int op)
7255{
7256 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007257
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007258 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7259 PyObject *v;
7260 if (((PyUnicodeObject *) left)->length !=
7261 ((PyUnicodeObject *) right)->length) {
7262 if (op == Py_EQ) {
7263 Py_INCREF(Py_False);
7264 return Py_False;
7265 }
7266 if (op == Py_NE) {
7267 Py_INCREF(Py_True);
7268 return Py_True;
7269 }
7270 }
7271 if (left == right)
7272 result = 0;
7273 else
7274 result = unicode_compare((PyUnicodeObject *)left,
7275 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007276
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007277 /* Convert the return value to a Boolean */
7278 switch (op) {
7279 case Py_EQ:
7280 v = TEST_COND(result == 0);
7281 break;
7282 case Py_NE:
7283 v = TEST_COND(result != 0);
7284 break;
7285 case Py_LE:
7286 v = TEST_COND(result <= 0);
7287 break;
7288 case Py_GE:
7289 v = TEST_COND(result >= 0);
7290 break;
7291 case Py_LT:
7292 v = TEST_COND(result == -1);
7293 break;
7294 case Py_GT:
7295 v = TEST_COND(result == 1);
7296 break;
7297 default:
7298 PyErr_BadArgument();
7299 return NULL;
7300 }
7301 Py_INCREF(v);
7302 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007304
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007305 Py_INCREF(Py_NotImplemented);
7306 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007307}
7308
Guido van Rossum403d68b2000-03-13 15:55:09 +00007309int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007311{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007312 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007313 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007314
7315 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007316 sub = PyUnicode_FromObject(element);
7317 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 PyErr_Format(PyExc_TypeError,
7319 "'in <string>' requires string as left operand, not %s",
7320 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007321 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007322 }
7323
Thomas Wouters477c8d52006-05-27 19:21:47 +00007324 str = PyUnicode_FromObject(container);
7325 if (!str) {
7326 Py_DECREF(sub);
7327 return -1;
7328 }
7329
7330 result = stringlib_contains_obj(str, sub);
7331
7332 Py_DECREF(str);
7333 Py_DECREF(sub);
7334
Guido van Rossum403d68b2000-03-13 15:55:09 +00007335 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007336}
7337
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338/* Concat to string or Unicode object giving a new Unicode object. */
7339
7340PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342{
7343 PyUnicodeObject *u = NULL, *v = NULL, *w;
7344
7345 /* Coerce the two arguments */
7346 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7347 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7350 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352
7353 /* Shortcuts */
7354 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 Py_DECREF(v);
7356 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007357 }
7358 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 Py_DECREF(u);
7360 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 }
7362
7363 /* Concat the two Unicode strings */
7364 w = _PyUnicode_New(u->length + v->length);
7365 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367 Py_UNICODE_COPY(w->str, u->str, u->length);
7368 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7369
7370 Py_DECREF(u);
7371 Py_DECREF(v);
7372 return (PyObject *)w;
7373
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 Py_XDECREF(u);
7376 Py_XDECREF(v);
7377 return NULL;
7378}
7379
Walter Dörwald1ab83302007-05-18 17:15:44 +00007380void
7381PyUnicode_Append(PyObject **pleft, PyObject *right)
7382{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007383 PyObject *new;
7384 if (*pleft == NULL)
7385 return;
7386 if (right == NULL || !PyUnicode_Check(*pleft)) {
7387 Py_DECREF(*pleft);
7388 *pleft = NULL;
7389 return;
7390 }
7391 new = PyUnicode_Concat(*pleft, right);
7392 Py_DECREF(*pleft);
7393 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007394}
7395
7396void
7397PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7398{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007399 PyUnicode_Append(pleft, right);
7400 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007401}
7402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007403PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007404 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007405\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007406Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007407string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007408interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409
7410static PyObject *
7411unicode_count(PyUnicodeObject *self, PyObject *args)
7412{
7413 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007414 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007415 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416 PyObject *result;
7417
Guido van Rossumb8872e62000-05-09 14:14:27 +00007418 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 return NULL;
7421
7422 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007423 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007426
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007427 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007428 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007429 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007430 substring->str, substring->length,
7431 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007432 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007433
7434 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007435
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436 return result;
7437}
7438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007439PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007440 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007442Encode S using the codec registered for encoding. Default encoding\n\
7443is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007444handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007445a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7446'xmlcharrefreplace' as well as any other name registered with\n\
7447codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
7449static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007450unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007451{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007452 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 char *encoding = NULL;
7454 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007455
Benjamin Peterson308d6372009-09-18 21:42:35 +00007456 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7457 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007459 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007460}
7461
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007462PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464\n\
7465Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007466If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
7468static PyObject*
7469unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7470{
7471 Py_UNICODE *e;
7472 Py_UNICODE *p;
7473 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007474 Py_UNICODE *qe;
7475 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 PyUnicodeObject *u;
7477 int tabsize = 8;
7478
7479 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481
Thomas Wouters7e474022000-07-16 12:04:32 +00007482 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007483 i = 0; /* chars up to and including most recent \n or \r */
7484 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7485 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486 for (p = self->str; p < e; p++)
7487 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 if (tabsize > 0) {
7489 incr = tabsize - (j % tabsize); /* cannot overflow */
7490 if (j > PY_SSIZE_T_MAX - incr)
7491 goto overflow1;
7492 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007493 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 if (j > PY_SSIZE_T_MAX - 1)
7497 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 j++;
7499 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 if (i > PY_SSIZE_T_MAX - j)
7501 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007503 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007504 }
7505 }
7506
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007507 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007508 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007509
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510 /* Second pass: create output string and fill it */
7511 u = _PyUnicode_New(i + j);
7512 if (!u)
7513 return NULL;
7514
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007515 j = 0; /* same as in first pass */
7516 q = u->str; /* next output char */
7517 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518
7519 for (p = self->str; p < e; p++)
7520 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 if (tabsize > 0) {
7522 i = tabsize - (j % tabsize);
7523 j += i;
7524 while (i--) {
7525 if (q >= qe)
7526 goto overflow2;
7527 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007528 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007530 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 else {
7532 if (q >= qe)
7533 goto overflow2;
7534 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007535 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 if (*p == '\n' || *p == '\r')
7537 j = 0;
7538 }
7539
7540 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007541
7542 overflow2:
7543 Py_DECREF(u);
7544 overflow1:
7545 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547}
7548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007549PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551\n\
7552Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007553such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554arguments start and end are interpreted as in slice notation.\n\
7555\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007556Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557
7558static PyObject *
7559unicode_find(PyUnicodeObject *self, PyObject *args)
7560{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007561 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007562 Py_ssize_t start;
7563 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007564 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565
Christian Heimes9cd17752007-11-18 19:35:23 +00007566 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568
Thomas Wouters477c8d52006-05-27 19:21:47 +00007569 result = stringlib_find_slice(
7570 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7571 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7572 start, end
7573 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574
7575 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007576
Christian Heimes217cfd12007-12-02 14:31:20 +00007577 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578}
7579
7580static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007581unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582{
7583 if (index < 0 || index >= self->length) {
7584 PyErr_SetString(PyExc_IndexError, "string index out of range");
7585 return NULL;
7586 }
7587
7588 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7589}
7590
Guido van Rossumc2504932007-09-18 19:42:40 +00007591/* Believe it or not, this produces the same value for ASCII strings
7592 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007593static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007594unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595{
Guido van Rossumc2504932007-09-18 19:42:40 +00007596 Py_ssize_t len;
7597 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007598 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007599
7600 if (self->hash != -1)
7601 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007602 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007603 p = self->str;
7604 x = *p << 7;
7605 while (--len >= 0)
7606 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007607 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007608 if (x == -1)
7609 x = -2;
7610 self->hash = x;
7611 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612}
7613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007614PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007617Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618
7619static PyObject *
7620unicode_index(PyUnicodeObject *self, PyObject *args)
7621{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007622 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007623 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007624 Py_ssize_t start;
7625 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
Christian Heimes9cd17752007-11-18 19:35:23 +00007627 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629
Thomas Wouters477c8d52006-05-27 19:21:47 +00007630 result = stringlib_find_slice(
7631 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7632 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7633 start, end
7634 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635
7636 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007637
Guido van Rossumd57fd912000-03-10 22:53:23 +00007638 if (result < 0) {
7639 PyErr_SetString(PyExc_ValueError, "substring not found");
7640 return NULL;
7641 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007642
Christian Heimes217cfd12007-12-02 14:31:20 +00007643 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007644}
7645
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007646PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007649Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007650at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651
7652static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007653unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654{
7655 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7656 register const Py_UNICODE *e;
7657 int cased;
7658
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659 /* Shortcut for single character strings */
7660 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007663 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007664 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007666
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667 e = p + PyUnicode_GET_SIZE(self);
7668 cased = 0;
7669 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007671
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7673 return PyBool_FromLong(0);
7674 else if (!cased && Py_UNICODE_ISLOWER(ch))
7675 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007677 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678}
7679
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007680PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007683Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007684at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685
7686static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007687unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688{
7689 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7690 register const Py_UNICODE *e;
7691 int cased;
7692
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 /* Shortcut for single character strings */
7694 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007697 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007698 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007700
Guido van Rossumd57fd912000-03-10 22:53:23 +00007701 e = p + PyUnicode_GET_SIZE(self);
7702 cased = 0;
7703 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007705
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7707 return PyBool_FromLong(0);
7708 else if (!cased && Py_UNICODE_ISUPPER(ch))
7709 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007711 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712}
7713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007714PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007717Return True if S is a titlecased string and there is at least one\n\
7718character in S, i.e. upper- and titlecase characters may only\n\
7719follow uncased characters and lowercase characters only cased ones.\n\
7720Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721
7722static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007723unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724{
7725 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7726 register const Py_UNICODE *e;
7727 int cased, previous_is_cased;
7728
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 /* Shortcut for single character strings */
7730 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7732 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007734 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007735 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007737
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 e = p + PyUnicode_GET_SIZE(self);
7739 cased = 0;
7740 previous_is_cased = 0;
7741 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007743
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7745 if (previous_is_cased)
7746 return PyBool_FromLong(0);
7747 previous_is_cased = 1;
7748 cased = 1;
7749 }
7750 else if (Py_UNICODE_ISLOWER(ch)) {
7751 if (!previous_is_cased)
7752 return PyBool_FromLong(0);
7753 previous_is_cased = 1;
7754 cased = 1;
7755 }
7756 else
7757 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007759 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760}
7761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007762PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007765Return True if all characters in S are whitespace\n\
7766and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767
7768static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007769unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770{
7771 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7772 register const Py_UNICODE *e;
7773
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774 /* Shortcut for single character strings */
7775 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 Py_UNICODE_ISSPACE(*p))
7777 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007779 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007780 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007782
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783 e = p + PyUnicode_GET_SIZE(self);
7784 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 if (!Py_UNICODE_ISSPACE(*p))
7786 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007788 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789}
7790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007791PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007793\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007794Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007795and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007796
7797static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007798unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007799{
7800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7801 register const Py_UNICODE *e;
7802
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007803 /* Shortcut for single character strings */
7804 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 Py_UNICODE_ISALPHA(*p))
7806 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007807
7808 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007809 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007811
7812 e = p + PyUnicode_GET_SIZE(self);
7813 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 if (!Py_UNICODE_ISALPHA(*p))
7815 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007816 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007817 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007818}
7819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007820PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007821 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007822\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007823Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007824and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007825
7826static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007827unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007828{
7829 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7830 register const Py_UNICODE *e;
7831
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007832 /* Shortcut for single character strings */
7833 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 Py_UNICODE_ISALNUM(*p))
7835 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007836
7837 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007838 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007840
7841 e = p + PyUnicode_GET_SIZE(self);
7842 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 if (!Py_UNICODE_ISALNUM(*p))
7844 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007845 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007846 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007847}
7848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007849PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007852Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007853False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854
7855static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007856unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857{
7858 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7859 register const Py_UNICODE *e;
7860
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861 /* Shortcut for single character strings */
7862 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 Py_UNICODE_ISDECIMAL(*p))
7864 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007866 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007867 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007869
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 e = p + PyUnicode_GET_SIZE(self);
7871 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 if (!Py_UNICODE_ISDECIMAL(*p))
7873 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007875 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876}
7877
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007878PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007881Return True if all characters in S are digits\n\
7882and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883
7884static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007885unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886{
7887 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7888 register const Py_UNICODE *e;
7889
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890 /* Shortcut for single character strings */
7891 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 Py_UNICODE_ISDIGIT(*p))
7893 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007895 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007896 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007898
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899 e = p + PyUnicode_GET_SIZE(self);
7900 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 if (!Py_UNICODE_ISDIGIT(*p))
7902 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007904 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007905}
7906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007907PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007910Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007911False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007912
7913static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007914unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915{
7916 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7917 register const Py_UNICODE *e;
7918
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 /* Shortcut for single character strings */
7920 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 Py_UNICODE_ISNUMERIC(*p))
7922 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007924 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007925 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007927
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928 e = p + PyUnicode_GET_SIZE(self);
7929 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 if (!Py_UNICODE_ISNUMERIC(*p))
7931 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007933 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007934}
7935
Martin v. Löwis47383402007-08-15 07:32:56 +00007936int
7937PyUnicode_IsIdentifier(PyObject *self)
7938{
7939 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7940 register const Py_UNICODE *e;
7941
7942 /* Special case for empty strings */
7943 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007945
7946 /* PEP 3131 says that the first character must be in
7947 XID_Start and subsequent characters in XID_Continue,
7948 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007949 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007950 letters, digits, underscore). However, given the current
7951 definition of XID_Start and XID_Continue, it is sufficient
7952 to check just for these, except that _ must be allowed
7953 as starting an identifier. */
7954 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7955 return 0;
7956
7957 e = p + PyUnicode_GET_SIZE(self);
7958 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 if (!_PyUnicode_IsXidContinue(*p))
7960 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007961 }
7962 return 1;
7963}
7964
7965PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007967\n\
7968Return True if S is a valid identifier according\n\
7969to the language definition.");
7970
7971static PyObject*
7972unicode_isidentifier(PyObject *self)
7973{
7974 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7975}
7976
Georg Brandl559e5d72008-06-11 18:37:52 +00007977PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007979\n\
7980Return True if all characters in S are considered\n\
7981printable in repr() or S is empty, False otherwise.");
7982
7983static PyObject*
7984unicode_isprintable(PyObject *self)
7985{
7986 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7987 register const Py_UNICODE *e;
7988
7989 /* Shortcut for single character strings */
7990 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7991 Py_RETURN_TRUE;
7992 }
7993
7994 e = p + PyUnicode_GET_SIZE(self);
7995 for (; p < e; p++) {
7996 if (!Py_UNICODE_ISPRINTABLE(*p)) {
7997 Py_RETURN_FALSE;
7998 }
7999 }
8000 Py_RETURN_TRUE;
8001}
8002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008003PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008004 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005\n\
8006Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008007iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008
8009static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008010unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008012 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013}
8014
Martin v. Löwis18e16552006-02-15 17:27:45 +00008015static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016unicode_length(PyUnicodeObject *self)
8017{
8018 return self->length;
8019}
8020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008021PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008024Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008025done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026
8027static PyObject *
8028unicode_ljust(PyUnicodeObject *self, PyObject *args)
8029{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008030 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008031 Py_UNICODE fillchar = ' ';
8032
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008033 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034 return NULL;
8035
Tim Peters7a29bd52001-09-12 03:03:31 +00008036 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 Py_INCREF(self);
8038 return (PyObject*) self;
8039 }
8040
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008041 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042}
8043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008044PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008047Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048
8049static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008050unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 return fixup(self, fixlower);
8053}
8054
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the constants above.
   The table is never modified, so both the strings and the array
   slots are const. */
static const char *const stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip type i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8063
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008064/* externally visible for str.strip(unicode) */
8065PyObject *
8066_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8067{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8069 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8070 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8071 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8072 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008073
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008075
Benjamin Peterson14339b62009-01-31 16:36:08 +00008076 i = 0;
8077 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8079 i++;
8080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008081 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008082
Benjamin Peterson14339b62009-01-31 16:36:08 +00008083 j = len;
8084 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 do {
8086 j--;
8087 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8088 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008089 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008090
Benjamin Peterson14339b62009-01-31 16:36:08 +00008091 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008092 Py_INCREF(self);
8093 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008094 }
8095 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008097}
8098
Guido van Rossumd57fd912000-03-10 22:53:23 +00008099
8100static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008101do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008103 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8104 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008105
Benjamin Peterson14339b62009-01-31 16:36:08 +00008106 i = 0;
8107 if (striptype != RIGHTSTRIP) {
8108 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8109 i++;
8110 }
8111 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008112
Benjamin Peterson14339b62009-01-31 16:36:08 +00008113 j = len;
8114 if (striptype != LEFTSTRIP) {
8115 do {
8116 j--;
8117 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8118 j++;
8119 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008120
Benjamin Peterson14339b62009-01-31 16:36:08 +00008121 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8122 Py_INCREF(self);
8123 return (PyObject*)self;
8124 }
8125 else
8126 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127}
8128
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008129
8130static PyObject *
8131do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8132{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008133 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008134
Benjamin Peterson14339b62009-01-31 16:36:08 +00008135 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8136 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008137
Benjamin Peterson14339b62009-01-31 16:36:08 +00008138 if (sep != NULL && sep != Py_None) {
8139 if (PyUnicode_Check(sep))
8140 return _PyUnicode_XStrip(self, striptype, sep);
8141 else {
8142 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 "%s arg must be None or str",
8144 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 return NULL;
8146 }
8147 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008148
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008150}
8151
8152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008153PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008155\n\
8156Return a copy of the string S with leading and trailing\n\
8157whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008158If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008159
8160static PyObject *
8161unicode_strip(PyUnicodeObject *self, PyObject *args)
8162{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008163 if (PyTuple_GET_SIZE(args) == 0)
8164 return do_strip(self, BOTHSTRIP); /* Common case */
8165 else
8166 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008167}
8168
8169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008170PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008172\n\
8173Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008174If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008175
8176static PyObject *
8177unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8178{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 if (PyTuple_GET_SIZE(args) == 0)
8180 return do_strip(self, LEFTSTRIP); /* Common case */
8181 else
8182 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008183}
8184
8185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008186PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008187 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008188\n\
8189Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008190If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008191
8192static PyObject *
8193unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8194{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008195 if (PyTuple_GET_SIZE(args) == 0)
8196 return do_strip(self, RIGHTSTRIP); /* Common case */
8197 else
8198 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008199}
8200
8201
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008203unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008204{
8205 PyUnicodeObject *u;
8206 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008207 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008208 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008209
Georg Brandl222de0f2009-04-12 12:01:50 +00008210 if (len < 1) {
8211 Py_INCREF(unicode_empty);
8212 return (PyObject *)unicode_empty;
8213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214
Tim Peters7a29bd52001-09-12 03:03:31 +00008215 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008216 /* no repeat, return original string */
8217 Py_INCREF(str);
8218 return (PyObject*) str;
8219 }
Tim Peters8f422462000-09-09 06:13:41 +00008220
8221 /* ensure # of chars needed doesn't overflow int and # of bytes
8222 * needed doesn't overflow size_t
8223 */
8224 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008225 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008226 PyErr_SetString(PyExc_OverflowError,
8227 "repeated string is too long");
8228 return NULL;
8229 }
8230 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8231 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8232 PyErr_SetString(PyExc_OverflowError,
8233 "repeated string is too long");
8234 return NULL;
8235 }
8236 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008237 if (!u)
8238 return NULL;
8239
8240 p = u->str;
8241
Georg Brandl222de0f2009-04-12 12:01:50 +00008242 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008243 Py_UNICODE_FILL(p, str->str[0], len);
8244 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008245 Py_ssize_t done = str->length; /* number of characters copied this far */
8246 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008248 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008249 Py_UNICODE_COPY(p+done, p, n);
8250 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 }
8253
8254 return (PyObject*) u;
8255}
8256
8257PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 PyObject *subobj,
8259 PyObject *replobj,
8260 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261{
8262 PyObject *self;
8263 PyObject *str1;
8264 PyObject *str2;
8265 PyObject *result;
8266
8267 self = PyUnicode_FromObject(obj);
8268 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 str1 = PyUnicode_FromObject(subobj);
8271 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 Py_DECREF(self);
8273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 }
8275 str2 = PyUnicode_FromObject(replobj);
8276 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 Py_DECREF(self);
8278 Py_DECREF(str1);
8279 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008280 }
Tim Petersced69f82003-09-16 20:30:58 +00008281 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 (PyUnicodeObject *)str1,
8283 (PyUnicodeObject *)str2,
8284 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 Py_DECREF(self);
8286 Py_DECREF(str1);
8287 Py_DECREF(str2);
8288 return result;
8289}
8290
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008291PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008292 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293\n\
8294Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008295old replaced by new. If the optional argument count is\n\
8296given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297
8298static PyObject*
8299unicode_replace(PyUnicodeObject *self, PyObject *args)
8300{
8301 PyUnicodeObject *str1;
8302 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008303 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 PyObject *result;
8305
Martin v. Löwis18e16552006-02-15 17:27:45 +00008306 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 return NULL;
8308 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8309 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008312 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 Py_DECREF(str1);
8314 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316
8317 result = replace(self, str1, str2, maxcount);
8318
8319 Py_DECREF(str1);
8320 Py_DECREF(str2);
8321 return result;
8322}
8323
8324static
8325PyObject *unicode_repr(PyObject *unicode)
8326{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008327 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008328 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008329 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8330 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8331
8332 /* XXX(nnorwitz): rather than over-allocating, it would be
8333 better to choose a different scheme. Perhaps scan the
8334 first N-chars of the string and allocate based on that size.
8335 */
8336 /* Initial allocation is based on the longest-possible unichr
8337 escape.
8338
8339 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8340 unichr, so in this case it's the longest unichr escape. In
8341 narrow (UTF-16) builds this is five chars per source unichr
8342 since there are two unichrs in the surrogate pair, so in narrow
8343 (UTF-16) builds it's not the longest unichr escape.
8344
8345 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8346 so in the narrow (UTF-16) build case it's the longest unichr
8347 escape.
8348 */
8349
Walter Dörwald1ab83302007-05-18 17:15:44 +00008350 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008352#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008354#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008356#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008358 if (repr == NULL)
8359 return NULL;
8360
Walter Dörwald1ab83302007-05-18 17:15:44 +00008361 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008362
8363 /* Add quote */
8364 *p++ = (findchar(s, size, '\'') &&
8365 !findchar(s, size, '"')) ? '"' : '\'';
8366 while (size-- > 0) {
8367 Py_UNICODE ch = *s++;
8368
8369 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008370 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008371 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008372 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008373 continue;
8374 }
8375
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008377 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008378 *p++ = '\\';
8379 *p++ = 't';
8380 }
8381 else if (ch == '\n') {
8382 *p++ = '\\';
8383 *p++ = 'n';
8384 }
8385 else if (ch == '\r') {
8386 *p++ = '\\';
8387 *p++ = 'r';
8388 }
8389
8390 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008391 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008392 *p++ = '\\';
8393 *p++ = 'x';
8394 *p++ = hexdigits[(ch >> 4) & 0x000F];
8395 *p++ = hexdigits[ch & 0x000F];
8396 }
8397
Georg Brandl559e5d72008-06-11 18:37:52 +00008398 /* Copy ASCII characters as-is */
8399 else if (ch < 0x7F) {
8400 *p++ = ch;
8401 }
8402
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008404 else {
8405 Py_UCS4 ucs = ch;
8406
8407#ifndef Py_UNICODE_WIDE
8408 Py_UNICODE ch2 = 0;
8409 /* Get code point from surrogate pair */
8410 if (size > 0) {
8411 ch2 = *s;
8412 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008414 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008416 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008417 size--;
8418 }
8419 }
8420#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008421 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008422 (categories Z* and C* except ASCII space)
8423 */
8424 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8425 /* Map 8-bit characters to '\xhh' */
8426 if (ucs <= 0xff) {
8427 *p++ = '\\';
8428 *p++ = 'x';
8429 *p++ = hexdigits[(ch >> 4) & 0x000F];
8430 *p++ = hexdigits[ch & 0x000F];
8431 }
8432 /* Map 21-bit characters to '\U00xxxxxx' */
8433 else if (ucs >= 0x10000) {
8434 *p++ = '\\';
8435 *p++ = 'U';
8436 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8437 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8438 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8439 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8440 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8441 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8442 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8443 *p++ = hexdigits[ucs & 0x0000000F];
8444 }
8445 /* Map 16-bit characters to '\uxxxx' */
8446 else {
8447 *p++ = '\\';
8448 *p++ = 'u';
8449 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8450 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8451 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8452 *p++ = hexdigits[ucs & 0x000F];
8453 }
8454 }
8455 /* Copy characters as-is */
8456 else {
8457 *p++ = ch;
8458#ifndef Py_UNICODE_WIDE
8459 if (ucs >= 0x10000)
8460 *p++ = ch2;
8461#endif
8462 }
8463 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008464 }
8465 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008466 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008467
8468 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008469 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008470 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008471}
8472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008473PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475\n\
8476Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008477such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478arguments start and end are interpreted as in slice notation.\n\
8479\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008480Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008481
8482static PyObject *
8483unicode_rfind(PyUnicodeObject *self, PyObject *args)
8484{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008485 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008486 Py_ssize_t start;
8487 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008488 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008489
Christian Heimes9cd17752007-11-18 19:35:23 +00008490 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008491 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492
Thomas Wouters477c8d52006-05-27 19:21:47 +00008493 result = stringlib_rfind_slice(
8494 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8495 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8496 start, end
8497 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498
8499 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008500
Christian Heimes217cfd12007-12-02 14:31:20 +00008501 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502}
8503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008504PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008507Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008508
8509static PyObject *
8510unicode_rindex(PyUnicodeObject *self, PyObject *args)
8511{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008512 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008513 Py_ssize_t start;
8514 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008515 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008516
Christian Heimes9cd17752007-11-18 19:35:23 +00008517 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008518 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519
Thomas Wouters477c8d52006-05-27 19:21:47 +00008520 result = stringlib_rfind_slice(
8521 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8522 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8523 start, end
8524 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008525
8526 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008527
Guido van Rossumd57fd912000-03-10 22:53:23 +00008528 if (result < 0) {
8529 PyErr_SetString(PyExc_ValueError, "substring not found");
8530 return NULL;
8531 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008532 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533}
8534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008535PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008536 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008538Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008539done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008540
8541static PyObject *
8542unicode_rjust(PyUnicodeObject *self, PyObject *args)
8543{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008544 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008545 Py_UNICODE fillchar = ' ';
8546
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008547 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008548 return NULL;
8549
Tim Peters7a29bd52001-09-12 03:03:31 +00008550 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551 Py_INCREF(self);
8552 return (PyObject*) self;
8553 }
8554
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008555 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556}
8557
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 PyObject *sep,
8560 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561{
8562 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008563
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564 s = PyUnicode_FromObject(s);
8565 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008566 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 if (sep != NULL) {
8568 sep = PyUnicode_FromObject(sep);
8569 if (sep == NULL) {
8570 Py_DECREF(s);
8571 return NULL;
8572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573 }
8574
8575 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8576
8577 Py_DECREF(s);
8578 Py_XDECREF(sep);
8579 return result;
8580}
8581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008582PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584\n\
8585Return a list of the words in S, using sep as the\n\
8586delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008587splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008588whitespace string is a separator and empty strings are\n\
8589removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590
8591static PyObject*
8592unicode_split(PyUnicodeObject *self, PyObject *args)
8593{
8594 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008595 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
Martin v. Löwis18e16552006-02-15 17:27:45 +00008597 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598 return NULL;
8599
8600 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606}
8607
Thomas Wouters477c8d52006-05-27 19:21:47 +00008608PyObject *
8609PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8610{
8611 PyObject* str_obj;
8612 PyObject* sep_obj;
8613 PyObject* out;
8614
8615 str_obj = PyUnicode_FromObject(str_in);
8616 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008618 sep_obj = PyUnicode_FromObject(sep_in);
8619 if (!sep_obj) {
8620 Py_DECREF(str_obj);
8621 return NULL;
8622 }
8623
8624 out = stringlib_partition(
8625 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8626 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8627 );
8628
8629 Py_DECREF(sep_obj);
8630 Py_DECREF(str_obj);
8631
8632 return out;
8633}
8634
8635
8636PyObject *
8637PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8638{
8639 PyObject* str_obj;
8640 PyObject* sep_obj;
8641 PyObject* out;
8642
8643 str_obj = PyUnicode_FromObject(str_in);
8644 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008646 sep_obj = PyUnicode_FromObject(sep_in);
8647 if (!sep_obj) {
8648 Py_DECREF(str_obj);
8649 return NULL;
8650 }
8651
8652 out = stringlib_rpartition(
8653 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8654 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8655 );
8656
8657 Py_DECREF(sep_obj);
8658 Py_DECREF(str_obj);
8659
8660 return out;
8661}
8662
8663PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008665\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008666Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008667the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008668found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008669
8670static PyObject*
8671unicode_partition(PyUnicodeObject *self, PyObject *separator)
8672{
8673 return PyUnicode_Partition((PyObject *)self, separator);
8674}
8675
8676PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008677 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008678\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008679Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008680the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008681separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008682
8683static PyObject*
8684unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8685{
8686 return PyUnicode_RPartition((PyObject *)self, separator);
8687}
8688
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008689PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 PyObject *sep,
8691 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008692{
8693 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008694
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008695 s = PyUnicode_FromObject(s);
8696 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008697 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 if (sep != NULL) {
8699 sep = PyUnicode_FromObject(sep);
8700 if (sep == NULL) {
8701 Py_DECREF(s);
8702 return NULL;
8703 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008704 }
8705
8706 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8707
8708 Py_DECREF(s);
8709 Py_XDECREF(sep);
8710 return result;
8711}
8712
8713PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008715\n\
8716Return a list of the words in S, using sep as the\n\
8717delimiter string, starting at the end of the string and\n\
8718working to the front. If maxsplit is given, at most maxsplit\n\
8719splits are done. If sep is not specified, any whitespace string\n\
8720is a separator.");
8721
8722static PyObject*
8723unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8724{
8725 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008726 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008727
Martin v. Löwis18e16552006-02-15 17:27:45 +00008728 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008729 return NULL;
8730
8731 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008733 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008735 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008737}
8738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008739PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741\n\
8742Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008743Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008744is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745
8746static PyObject*
8747unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8748{
Guido van Rossum86662912000-04-11 15:38:46 +00008749 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750
Guido van Rossum86662912000-04-11 15:38:46 +00008751 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 return NULL;
8753
Guido van Rossum86662912000-04-11 15:38:46 +00008754 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755}
8756
8757static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008758PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759{
Walter Dörwald346737f2007-05-31 10:44:43 +00008760 if (PyUnicode_CheckExact(self)) {
8761 Py_INCREF(self);
8762 return self;
8763 } else
8764 /* Subtype -- return genuine unicode string with the same value. */
8765 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8766 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767}
8768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008769PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771\n\
8772Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008773and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774
8775static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008776unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 return fixup(self, fixswapcase);
8779}
8780
Georg Brandlceee0772007-11-27 23:48:05 +00008781PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008783\n\
8784Return a translation table usable for str.translate().\n\
8785If there is only one argument, it must be a dictionary mapping Unicode\n\
8786ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008787Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008788If there are two arguments, they must be strings of equal length, and\n\
8789in the resulting dictionary, each character in x will be mapped to the\n\
8790character at the same position in y. If there is a third argument, it\n\
8791must be a string, whose characters will be mapped to None in the result.");
8792
8793static PyObject*
8794unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8795{
8796 PyObject *x, *y = NULL, *z = NULL;
8797 PyObject *new = NULL, *key, *value;
8798 Py_ssize_t i = 0;
8799 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008800
Georg Brandlceee0772007-11-27 23:48:05 +00008801 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8802 return NULL;
8803 new = PyDict_New();
8804 if (!new)
8805 return NULL;
8806 if (y != NULL) {
8807 /* x must be a string too, of equal length */
8808 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8809 if (!PyUnicode_Check(x)) {
8810 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8811 "be a string if there is a second argument");
8812 goto err;
8813 }
8814 if (PyUnicode_GET_SIZE(x) != ylen) {
8815 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8816 "arguments must have equal length");
8817 goto err;
8818 }
8819 /* create entries for translating chars in x to those in y */
8820 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008821 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8822 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008823 if (!key || !value)
8824 goto err;
8825 res = PyDict_SetItem(new, key, value);
8826 Py_DECREF(key);
8827 Py_DECREF(value);
8828 if (res < 0)
8829 goto err;
8830 }
8831 /* create entries for deleting chars in z */
8832 if (z != NULL) {
8833 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008834 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008835 if (!key)
8836 goto err;
8837 res = PyDict_SetItem(new, key, Py_None);
8838 Py_DECREF(key);
8839 if (res < 0)
8840 goto err;
8841 }
8842 }
8843 } else {
8844 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008845 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008846 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8847 "to maketrans it must be a dict");
8848 goto err;
8849 }
8850 /* copy entries into the new dict, converting string keys to int keys */
8851 while (PyDict_Next(x, &i, &key, &value)) {
8852 if (PyUnicode_Check(key)) {
8853 /* convert string keys to integer keys */
8854 PyObject *newkey;
8855 if (PyUnicode_GET_SIZE(key) != 1) {
8856 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8857 "table must be of length 1");
8858 goto err;
8859 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008860 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008861 if (!newkey)
8862 goto err;
8863 res = PyDict_SetItem(new, newkey, value);
8864 Py_DECREF(newkey);
8865 if (res < 0)
8866 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008867 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008868 /* just keep integer keys */
8869 if (PyDict_SetItem(new, key, value) < 0)
8870 goto err;
8871 } else {
8872 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8873 "be strings or integers");
8874 goto err;
8875 }
8876 }
8877 }
8878 return new;
8879 err:
8880 Py_DECREF(new);
8881 return NULL;
8882}
8883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008884PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886\n\
8887Return a copy of the string S, where all characters have been mapped\n\
8888through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008889Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008890Unmapped characters are left untouched. Characters mapped to None\n\
8891are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892
8893static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008894unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008895{
Georg Brandlceee0772007-11-27 23:48:05 +00008896 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008897}
8898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008899PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008902Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903
8904static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008905unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008906{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907 return fixup(self, fixupper);
8908}
8909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008910PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008913Pad a numeric string S with zeros on the left, to fill a field\n\
8914of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008915
8916static PyObject *
8917unicode_zfill(PyUnicodeObject *self, PyObject *args)
8918{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008919 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008920 PyUnicodeObject *u;
8921
Martin v. Löwis18e16552006-02-15 17:27:45 +00008922 Py_ssize_t width;
8923 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 return NULL;
8925
8926 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008927 if (PyUnicode_CheckExact(self)) {
8928 Py_INCREF(self);
8929 return (PyObject*) self;
8930 }
8931 else
8932 return PyUnicode_FromUnicode(
8933 PyUnicode_AS_UNICODE(self),
8934 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936 }
8937
8938 fill = width - self->length;
8939
8940 u = pad(self, fill, 0, '0');
8941
Walter Dörwald068325e2002-04-15 13:36:47 +00008942 if (u == NULL)
8943 return NULL;
8944
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945 if (u->str[fill] == '+' || u->str[fill] == '-') {
8946 /* move sign to beginning of string */
8947 u->str[0] = u->str[fill];
8948 u->str[fill] = '0';
8949 }
8950
8951 return (PyObject*) u;
8952}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953
#if 0
/* Compiled out.  Returns the file-level numfree counter as an int
   object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}

/* Compiled out.  Passes self's character buffer and length to
   PyUnicode_TransformDecimalToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
8968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008969PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008970 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008972Return True if S starts with the specified prefix, False otherwise.\n\
8973With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008974With optional end, stop comparing S at that position.\n\
8975prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008976
8977static PyObject *
8978unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008981 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008983 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008984 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008985 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008987 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8989 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008990 if (PyTuple_Check(subobj)) {
8991 Py_ssize_t i;
8992 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8993 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008995 if (substring == NULL)
8996 return NULL;
8997 result = tailmatch(self, substring, start, end, -1);
8998 Py_DECREF(substring);
8999 if (result) {
9000 Py_RETURN_TRUE;
9001 }
9002 }
9003 /* nothing matched */
9004 Py_RETURN_FALSE;
9005 }
9006 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009009 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009011 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012}
9013
9014
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009015PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009016 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009018Return True if S ends with the specified suffix, False otherwise.\n\
9019With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009020With optional end, stop comparing S at that position.\n\
9021suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022
9023static PyObject *
9024unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009025 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009027 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009029 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009030 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009031 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009033 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9035 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009036 if (PyTuple_Check(subobj)) {
9037 Py_ssize_t i;
9038 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9039 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009041 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009043 result = tailmatch(self, substring, start, end, +1);
9044 Py_DECREF(substring);
9045 if (result) {
9046 Py_RETURN_TRUE;
9047 }
9048 }
9049 Py_RETURN_FALSE;
9050 }
9051 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009054
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009055 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009057 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058}
9059
Eric Smith8c663262007-08-25 02:26:07 +00009060#include "stringlib/string_format.h"
9061
9062PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009063 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009064\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009065Return a formatted version of S, using substitutions from args and kwargs.\n\
9066The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009067
Eric Smith27bbca62010-11-04 17:06:58 +00009068PyDoc_STRVAR(format_map__doc__,
9069 "S.format_map(mapping) -> str\n\
9070\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009071Return a formatted version of S, using substitutions from mapping.\n\
9072The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009073
Eric Smith4a7d76d2008-05-30 18:10:19 +00009074static PyObject *
9075unicode__format__(PyObject* self, PyObject* args)
9076{
9077 PyObject *format_spec;
9078
9079 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9080 return NULL;
9081
9082 return _PyUnicode_FormatAdvanced(self,
9083 PyUnicode_AS_UNICODE(format_spec),
9084 PyUnicode_GET_SIZE(format_spec));
9085}
9086
Eric Smith8c663262007-08-25 02:26:07 +00009087PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009089\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009090Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009091
9092static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009093unicode__sizeof__(PyUnicodeObject *v)
9094{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009095 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9096 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009097}
9098
9099PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009101
9102static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009103unicode_getnewargs(PyUnicodeObject *v)
9104{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009105 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009106}
9107
/* Method table for str.  Each entry gives the Python-level name, the C
   function implementing it, the calling-convention flags and, where one
   is supplied, the docstring.  "maketrans" is additionally flagged
   METH_STATIC.  The entries inside the two "#if 0" sections are compiled
   out.  The table ends with a {NULL, NULL} sentinel entry. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108static PyMethodDef unicode_methods[] = {
9109
9110 /* Order is according to common usage: often used methods should
9111 appear first, since lookup is done sequentially. */
9112
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009113 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009114 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9115 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009116 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009117 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9118 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9119 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9120 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9121 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9122 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9123 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009124 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009125 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9126 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9127 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009128 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009129 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9130 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9131 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009132 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009133 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009134 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009135 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009136 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9137 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9138 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9139 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9140 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9141 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9142 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9143 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9144 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9145 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9146 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9147 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9148 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9149 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009150 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009151 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009152 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009153 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009154 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009155 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009156 {"maketrans", (PyCFunction) unicode_maketrans,
9157 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009158 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009159#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009160 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161#endif
9162
9163#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009164 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009165 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009166 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167#endif
9168
/* No docstring is supplied for __getnewargs__; the entry after it is
   the terminating sentinel. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009169 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170 {NULL, NULL}
9171};
9172
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009173static PyObject *
9174unicode_mod(PyObject *v, PyObject *w)
9175{
Benjamin Peterson29060642009-01-31 22:14:21 +00009176 if (!PyUnicode_Check(v)) {
9177 Py_INCREF(Py_NotImplemented);
9178 return Py_NotImplemented;
9179 }
9180 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009181}
9182
/* Number-protocol table for str.  Only the remainder slot is filled in
   (with unicode_mod); the three slots listed before it are 0 and no
   later slots are listed in the initializer. */
9183static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009184 0, /*nb_add*/
9185 0, /*nb_subtract*/
9186 0, /*nb_multiply*/
9187 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009188};
9189
/* Sequence-protocol table for str: length, concatenation, repetition,
   item access and containment are provided; the slice and both
   assignment slots are 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009190static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009191 (lenfunc) unicode_length, /* sq_length */
9192 PyUnicode_Concat, /* sq_concat */
9193 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9194 (ssizeargfunc) unicode_getitem, /* sq_item */
9195 0, /* sq_slice */
9196 0, /* sq_ass_item */
9197 0, /* sq_ass_slice */
9198 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009199};
9200
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009201static PyObject*
9202unicode_subscript(PyUnicodeObject* self, PyObject* item)
9203{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009204 if (PyIndex_Check(item)) {
9205 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009206 if (i == -1 && PyErr_Occurred())
9207 return NULL;
9208 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009209 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009210 return unicode_getitem(self, i);
9211 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009212 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009213 Py_UNICODE* source_buf;
9214 Py_UNICODE* result_buf;
9215 PyObject* result;
9216
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009217 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009219 return NULL;
9220 }
9221
9222 if (slicelength <= 0) {
9223 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009224 } else if (start == 0 && step == 1 && slicelength == self->length &&
9225 PyUnicode_CheckExact(self)) {
9226 Py_INCREF(self);
9227 return (PyObject *)self;
9228 } else if (step == 1) {
9229 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009230 } else {
9231 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009232 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9233 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009234
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 if (result_buf == NULL)
9236 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009237
9238 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9239 result_buf[i] = source_buf[cur];
9240 }
Tim Petersced69f82003-09-16 20:30:58 +00009241
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009242 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009243 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009244 return result;
9245 }
9246 } else {
9247 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9248 return NULL;
9249 }
9250}
9251
/* Mapping-protocol table for str: length and subscript lookup are
   provided; the subscript-assignment slot is 0. */
9252static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009253 (lenfunc)unicode_length, /* mp_length */
9254 (binaryfunc)unicode_subscript, /* mp_subscript */
9255 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009256};
9257
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258
Guido van Rossumd57fd912000-03-10 22:53:23 +00009259/* Helpers for PyUnicode_Format() */
9260
9261static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009262getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009264 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009266 (*p_argidx)++;
9267 if (arglen < 0)
9268 return args;
9269 else
9270 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009271 }
9272 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009273 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009274 return NULL;
9275}
9276
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009277/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009279static PyObject *
9280formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009282 char *p;
9283 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009285
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286 x = PyFloat_AsDouble(v);
9287 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009288 return NULL;
9289
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009291 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009292
Eric Smith0923d1d2009-04-16 20:16:10 +00009293 p = PyOS_double_to_string(x, type, prec,
9294 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009295 if (p == NULL)
9296 return NULL;
9297 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009298 PyMem_Free(p);
9299 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009300}
9301
Tim Peters38fd5b62000-09-21 05:43:11 +00009302static PyObject*
9303formatlong(PyObject *val, int flags, int prec, int type)
9304{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009305 char *buf;
9306 int len;
9307 PyObject *str; /* temporary string object. */
9308 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009309
Benjamin Peterson14339b62009-01-31 16:36:08 +00009310 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9311 if (!str)
9312 return NULL;
9313 result = PyUnicode_FromStringAndSize(buf, len);
9314 Py_DECREF(str);
9315 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009316}
9317
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318static int
9319formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009320 size_t buflen,
9321 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009323 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009324 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 if (PyUnicode_GET_SIZE(v) == 1) {
9326 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9327 buf[1] = '\0';
9328 return 1;
9329 }
9330#ifndef Py_UNICODE_WIDE
9331 if (PyUnicode_GET_SIZE(v) == 2) {
9332 /* Decode a valid surrogate pair */
9333 int c0 = PyUnicode_AS_UNICODE(v)[0];
9334 int c1 = PyUnicode_AS_UNICODE(v)[1];
9335 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9336 0xDC00 <= c1 && c1 <= 0xDFFF) {
9337 buf[0] = c0;
9338 buf[1] = c1;
9339 buf[2] = '\0';
9340 return 2;
9341 }
9342 }
9343#endif
9344 goto onError;
9345 }
9346 else {
9347 /* Integer input truncated to a character */
9348 long x;
9349 x = PyLong_AsLong(v);
9350 if (x == -1 && PyErr_Occurred())
9351 goto onError;
9352
9353 if (x < 0 || x > 0x10ffff) {
9354 PyErr_SetString(PyExc_OverflowError,
9355 "%c arg not in range(0x110000)");
9356 return -1;
9357 }
9358
9359#ifndef Py_UNICODE_WIDE
9360 if (x > 0xffff) {
9361 x -= 0x10000;
9362 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9363 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9364 return 2;
9365 }
9366#endif
9367 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009368 buf[1] = '\0';
9369 return 1;
9370 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009371
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009373 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009374 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009375 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376}
9377
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is parenthesized as a whole so that it stays a single
   operand wherever the macro is used (e.g. after sizeof).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009382
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385{
9386 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009387 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388 int args_owned = 0;
9389 PyUnicodeObject *result = NULL;
9390 PyObject *dict = NULL;
9391 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009392
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 PyErr_BadInternalCall();
9395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396 }
9397 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009398 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400 fmt = PyUnicode_AS_UNICODE(uformat);
9401 fmtcnt = PyUnicode_GET_SIZE(uformat);
9402
9403 reslen = rescnt = fmtcnt + 100;
9404 result = _PyUnicode_New(reslen);
9405 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009406 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009407 res = PyUnicode_AS_UNICODE(result);
9408
9409 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 arglen = PyTuple_Size(args);
9411 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412 }
9413 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 arglen = -1;
9415 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009417 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009418 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009419 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420
9421 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009422 if (*fmt != '%') {
9423 if (--rescnt < 0) {
9424 rescnt = fmtcnt + 100;
9425 reslen += rescnt;
9426 if (_PyUnicode_Resize(&result, reslen) < 0)
9427 goto onError;
9428 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9429 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009430 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009431 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009432 }
9433 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009434 /* Got a format specifier */
9435 int flags = 0;
9436 Py_ssize_t width = -1;
9437 int prec = -1;
9438 Py_UNICODE c = '\0';
9439 Py_UNICODE fill;
9440 int isnumok;
9441 PyObject *v = NULL;
9442 PyObject *temp = NULL;
9443 Py_UNICODE *pbuf;
9444 Py_UNICODE sign;
9445 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009446 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447
Benjamin Peterson29060642009-01-31 22:14:21 +00009448 fmt++;
9449 if (*fmt == '(') {
9450 Py_UNICODE *keystart;
9451 Py_ssize_t keylen;
9452 PyObject *key;
9453 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009454
Benjamin Peterson29060642009-01-31 22:14:21 +00009455 if (dict == NULL) {
9456 PyErr_SetString(PyExc_TypeError,
9457 "format requires a mapping");
9458 goto onError;
9459 }
9460 ++fmt;
9461 --fmtcnt;
9462 keystart = fmt;
9463 /* Skip over balanced parentheses */
9464 while (pcount > 0 && --fmtcnt >= 0) {
9465 if (*fmt == ')')
9466 --pcount;
9467 else if (*fmt == '(')
9468 ++pcount;
9469 fmt++;
9470 }
9471 keylen = fmt - keystart - 1;
9472 if (fmtcnt < 0 || pcount > 0) {
9473 PyErr_SetString(PyExc_ValueError,
9474 "incomplete format key");
9475 goto onError;
9476 }
9477#if 0
9478 /* keys are converted to strings using UTF-8 and
9479 then looked up since Python uses strings to hold
9480 variables names etc. in its namespaces and we
9481 wouldn't want to break common idioms. */
9482 key = PyUnicode_EncodeUTF8(keystart,
9483 keylen,
9484 NULL);
9485#else
9486 key = PyUnicode_FromUnicode(keystart, keylen);
9487#endif
9488 if (key == NULL)
9489 goto onError;
9490 if (args_owned) {
9491 Py_DECREF(args);
9492 args_owned = 0;
9493 }
9494 args = PyObject_GetItem(dict, key);
9495 Py_DECREF(key);
9496 if (args == NULL) {
9497 goto onError;
9498 }
9499 args_owned = 1;
9500 arglen = -1;
9501 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009502 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009503 while (--fmtcnt >= 0) {
9504 switch (c = *fmt++) {
9505 case '-': flags |= F_LJUST; continue;
9506 case '+': flags |= F_SIGN; continue;
9507 case ' ': flags |= F_BLANK; continue;
9508 case '#': flags |= F_ALT; continue;
9509 case '0': flags |= F_ZERO; continue;
9510 }
9511 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009512 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 if (c == '*') {
9514 v = getnextarg(args, arglen, &argidx);
9515 if (v == NULL)
9516 goto onError;
9517 if (!PyLong_Check(v)) {
9518 PyErr_SetString(PyExc_TypeError,
9519 "* wants int");
9520 goto onError;
9521 }
9522 width = PyLong_AsLong(v);
9523 if (width == -1 && PyErr_Occurred())
9524 goto onError;
9525 if (width < 0) {
9526 flags |= F_LJUST;
9527 width = -width;
9528 }
9529 if (--fmtcnt >= 0)
9530 c = *fmt++;
9531 }
9532 else if (c >= '0' && c <= '9') {
9533 width = c - '0';
9534 while (--fmtcnt >= 0) {
9535 c = *fmt++;
9536 if (c < '0' || c > '9')
9537 break;
9538 if ((width*10) / 10 != width) {
9539 PyErr_SetString(PyExc_ValueError,
9540 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009541 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009542 }
9543 width = width*10 + (c - '0');
9544 }
9545 }
9546 if (c == '.') {
9547 prec = 0;
9548 if (--fmtcnt >= 0)
9549 c = *fmt++;
9550 if (c == '*') {
9551 v = getnextarg(args, arglen, &argidx);
9552 if (v == NULL)
9553 goto onError;
9554 if (!PyLong_Check(v)) {
9555 PyErr_SetString(PyExc_TypeError,
9556 "* wants int");
9557 goto onError;
9558 }
9559 prec = PyLong_AsLong(v);
9560 if (prec == -1 && PyErr_Occurred())
9561 goto onError;
9562 if (prec < 0)
9563 prec = 0;
9564 if (--fmtcnt >= 0)
9565 c = *fmt++;
9566 }
9567 else if (c >= '0' && c <= '9') {
9568 prec = c - '0';
9569 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009570 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 if (c < '0' || c > '9')
9572 break;
9573 if ((prec*10) / 10 != prec) {
9574 PyErr_SetString(PyExc_ValueError,
9575 "prec too big");
9576 goto onError;
9577 }
9578 prec = prec*10 + (c - '0');
9579 }
9580 }
9581 } /* prec */
9582 if (fmtcnt >= 0) {
9583 if (c == 'h' || c == 'l' || c == 'L') {
9584 if (--fmtcnt >= 0)
9585 c = *fmt++;
9586 }
9587 }
9588 if (fmtcnt < 0) {
9589 PyErr_SetString(PyExc_ValueError,
9590 "incomplete format");
9591 goto onError;
9592 }
9593 if (c != '%') {
9594 v = getnextarg(args, arglen, &argidx);
9595 if (v == NULL)
9596 goto onError;
9597 }
9598 sign = 0;
9599 fill = ' ';
9600 switch (c) {
9601
9602 case '%':
9603 pbuf = formatbuf;
9604 /* presume that buffer length is at least 1 */
9605 pbuf[0] = '%';
9606 len = 1;
9607 break;
9608
9609 case 's':
9610 case 'r':
9611 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009612 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009613 temp = v;
9614 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009615 }
9616 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009617 if (c == 's')
9618 temp = PyObject_Str(v);
9619 else if (c == 'r')
9620 temp = PyObject_Repr(v);
9621 else
9622 temp = PyObject_ASCII(v);
9623 if (temp == NULL)
9624 goto onError;
9625 if (PyUnicode_Check(temp))
9626 /* nothing to do */;
9627 else {
9628 Py_DECREF(temp);
9629 PyErr_SetString(PyExc_TypeError,
9630 "%s argument has non-string str()");
9631 goto onError;
9632 }
9633 }
9634 pbuf = PyUnicode_AS_UNICODE(temp);
9635 len = PyUnicode_GET_SIZE(temp);
9636 if (prec >= 0 && len > prec)
9637 len = prec;
9638 break;
9639
9640 case 'i':
9641 case 'd':
9642 case 'u':
9643 case 'o':
9644 case 'x':
9645 case 'X':
9646 if (c == 'i')
9647 c = 'd';
9648 isnumok = 0;
9649 if (PyNumber_Check(v)) {
9650 PyObject *iobj=NULL;
9651
9652 if (PyLong_Check(v)) {
9653 iobj = v;
9654 Py_INCREF(iobj);
9655 }
9656 else {
9657 iobj = PyNumber_Long(v);
9658 }
9659 if (iobj!=NULL) {
9660 if (PyLong_Check(iobj)) {
9661 isnumok = 1;
9662 temp = formatlong(iobj, flags, prec, c);
9663 Py_DECREF(iobj);
9664 if (!temp)
9665 goto onError;
9666 pbuf = PyUnicode_AS_UNICODE(temp);
9667 len = PyUnicode_GET_SIZE(temp);
9668 sign = 1;
9669 }
9670 else {
9671 Py_DECREF(iobj);
9672 }
9673 }
9674 }
9675 if (!isnumok) {
9676 PyErr_Format(PyExc_TypeError,
9677 "%%%c format: a number is required, "
9678 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9679 goto onError;
9680 }
9681 if (flags & F_ZERO)
9682 fill = '0';
9683 break;
9684
9685 case 'e':
9686 case 'E':
9687 case 'f':
9688 case 'F':
9689 case 'g':
9690 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009691 temp = formatfloat(v, flags, prec, c);
9692 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009693 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009694 pbuf = PyUnicode_AS_UNICODE(temp);
9695 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009696 sign = 1;
9697 if (flags & F_ZERO)
9698 fill = '0';
9699 break;
9700
9701 case 'c':
9702 pbuf = formatbuf;
9703 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9704 if (len < 0)
9705 goto onError;
9706 break;
9707
9708 default:
9709 PyErr_Format(PyExc_ValueError,
9710 "unsupported format character '%c' (0x%x) "
9711 "at index %zd",
9712 (31<=c && c<=126) ? (char)c : '?',
9713 (int)c,
9714 (Py_ssize_t)(fmt - 1 -
9715 PyUnicode_AS_UNICODE(uformat)));
9716 goto onError;
9717 }
9718 if (sign) {
9719 if (*pbuf == '-' || *pbuf == '+') {
9720 sign = *pbuf++;
9721 len--;
9722 }
9723 else if (flags & F_SIGN)
9724 sign = '+';
9725 else if (flags & F_BLANK)
9726 sign = ' ';
9727 else
9728 sign = 0;
9729 }
9730 if (width < len)
9731 width = len;
9732 if (rescnt - (sign != 0) < width) {
9733 reslen -= rescnt;
9734 rescnt = width + fmtcnt + 100;
9735 reslen += rescnt;
9736 if (reslen < 0) {
9737 Py_XDECREF(temp);
9738 PyErr_NoMemory();
9739 goto onError;
9740 }
9741 if (_PyUnicode_Resize(&result, reslen) < 0) {
9742 Py_XDECREF(temp);
9743 goto onError;
9744 }
9745 res = PyUnicode_AS_UNICODE(result)
9746 + reslen - rescnt;
9747 }
9748 if (sign) {
9749 if (fill != ' ')
9750 *res++ = sign;
9751 rescnt--;
9752 if (width > len)
9753 width--;
9754 }
9755 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9756 assert(pbuf[0] == '0');
9757 assert(pbuf[1] == c);
9758 if (fill != ' ') {
9759 *res++ = *pbuf++;
9760 *res++ = *pbuf++;
9761 }
9762 rescnt -= 2;
9763 width -= 2;
9764 if (width < 0)
9765 width = 0;
9766 len -= 2;
9767 }
9768 if (width > len && !(flags & F_LJUST)) {
9769 do {
9770 --rescnt;
9771 *res++ = fill;
9772 } while (--width > len);
9773 }
9774 if (fill == ' ') {
9775 if (sign)
9776 *res++ = sign;
9777 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9778 assert(pbuf[0] == '0');
9779 assert(pbuf[1] == c);
9780 *res++ = *pbuf++;
9781 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009782 }
9783 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009784 Py_UNICODE_COPY(res, pbuf, len);
9785 res += len;
9786 rescnt -= len;
9787 while (--width >= len) {
9788 --rescnt;
9789 *res++ = ' ';
9790 }
9791 if (dict && (argidx < arglen) && c != '%') {
9792 PyErr_SetString(PyExc_TypeError,
9793 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009794 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009795 goto onError;
9796 }
9797 Py_XDECREF(temp);
9798 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799 } /* until end */
9800 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009801 PyErr_SetString(PyExc_TypeError,
9802 "not all arguments converted during string formatting");
9803 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804 }
9805
Thomas Woutersa96affe2006-03-12 00:29:36 +00009806 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009807 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009809 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810 }
9811 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812 return (PyObject *)result;
9813
Benjamin Peterson29060642009-01-31 22:14:21 +00009814 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815 Py_XDECREF(result);
9816 Py_DECREF(uformat);
9817 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009818 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819 }
9820 return NULL;
9821}
9822
Jeremy Hylton938ace62002-07-17 16:30:39 +00009823static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009824unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9825
Tim Peters6d6c1a32001-08-02 04:15:00 +00009826static PyObject *
9827unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9828{
Benjamin Peterson29060642009-01-31 22:14:21 +00009829 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009830 static char *kwlist[] = {"object", "encoding", "errors", 0};
9831 char *encoding = NULL;
9832 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009833
Benjamin Peterson14339b62009-01-31 16:36:08 +00009834 if (type != &PyUnicode_Type)
9835 return unicode_subtype_new(type, args, kwds);
9836 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009837 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009838 return NULL;
9839 if (x == NULL)
9840 return (PyObject *)_PyUnicode_New(0);
9841 if (encoding == NULL && errors == NULL)
9842 return PyObject_Str(x);
9843 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009844 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009845}
9846
Guido van Rossume023fe02001-08-30 03:12:59 +00009847static PyObject *
9848unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9849{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009850 PyUnicodeObject *tmp, *pnew;
9851 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009852
Benjamin Peterson14339b62009-01-31 16:36:08 +00009853 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9854 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9855 if (tmp == NULL)
9856 return NULL;
9857 assert(PyUnicode_Check(tmp));
9858 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9859 if (pnew == NULL) {
9860 Py_DECREF(tmp);
9861 return NULL;
9862 }
9863 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9864 if (pnew->str == NULL) {
9865 _Py_ForgetReference((PyObject *)pnew);
9866 PyObject_Del(pnew);
9867 Py_DECREF(tmp);
9868 return PyErr_NoMemory();
9869 }
9870 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9871 pnew->length = n;
9872 pnew->hash = tmp->hash;
9873 Py_DECREF(tmp);
9874 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009875}
9876
/* Docstring of the str type; referenced from the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009883
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009884static PyObject *unicode_iter(PyObject *seq);
9885
/* Type object for str.  Slots are given positionally; the comment on
   each line names the slot being filled.  Slots left as 0 are not set
   by this table. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                                      /* tp_name */
    sizeof(PyUnicodeObject),                    /* tp_basicsize */
    0,                                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,                /* tp_dealloc */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_reserved */
    unicode_repr,                               /* tp_repr */
    &unicode_as_number,                         /* tp_as_number */
    &unicode_as_sequence,                       /* tp_as_sequence */
    &unicode_as_mapping,                        /* tp_as_mapping */
    (hashfunc) unicode_hash,                    /* tp_hash*/
    0,                                          /* tp_call*/
    (reprfunc) unicode_str,                     /* tp_str */
    PyObject_GenericGetAttr,                    /* tp_getattro */
    0,                                          /* tp_setattro */
    0,                                          /* tp_as_buffer */
    /* Subclassable, and flagged as a str (sub)class. */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,                /* tp_flags */
    unicode_doc,                                /* tp_doc */
    0,                                          /* tp_traverse */
    0,                                          /* tp_clear */
    PyUnicode_RichCompare,                      /* tp_richcompare */
    0,                                          /* tp_weaklistoffset */
    unicode_iter,                               /* tp_iter */
    0,                                          /* tp_iternext */
    unicode_methods,                            /* tp_methods */
    0,                                          /* tp_members */
    0,                                          /* tp_getset */
    &PyBaseObject_Type,                         /* tp_base */
    0,                                          /* tp_dict */
    0,                                          /* tp_descr_get */
    0,                                          /* tp_descr_set */
    0,                                          /* tp_dictoffset */
    0,                                          /* tp_init */
    0,                                          /* tp_alloc */
    unicode_new,                                /* tp_new */
    PyObject_Del,                               /* tp_free */
};
9929
9930/* Initialize the Unicode implementation */
9931
Thomas Wouters78890102000-07-22 19:25:51 +00009932void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009934 int i;
9935
Thomas Wouters477c8d52006-05-27 19:21:47 +00009936 /* XXX - move this array to unicodectype.c ? */
9937 Py_UNICODE linebreak[] = {
9938 0x000A, /* LINE FEED */
9939 0x000D, /* CARRIAGE RETURN */
9940 0x001C, /* FILE SEPARATOR */
9941 0x001D, /* GROUP SEPARATOR */
9942 0x001E, /* RECORD SEPARATOR */
9943 0x0085, /* NEXT LINE */
9944 0x2028, /* LINE SEPARATOR */
9945 0x2029, /* PARAGRAPH SEPARATOR */
9946 };
9947
Fred Drakee4315f52000-05-09 19:53:39 +00009948 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009949 free_list = NULL;
9950 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009952 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009953 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009954
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009955 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009956 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009957 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009958 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009959
9960 /* initialize the linebreak bloom filter */
9961 bloom_linebreak = make_bloom_mask(
9962 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9963 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009964
9965 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966}
9967
9968/* Finalize the Unicode implementation */
9969
Christian Heimesa156e092008-02-16 07:38:31 +00009970int
9971PyUnicode_ClearFreeList(void)
9972{
9973 int freelist_size = numfree;
9974 PyUnicodeObject *u;
9975
9976 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009977 PyUnicodeObject *v = u;
9978 u = *(PyUnicodeObject **)u;
9979 if (v->str)
9980 PyObject_DEL(v->str);
9981 Py_XDECREF(v->defenc);
9982 PyObject_Del(v);
9983 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009984 }
9985 free_list = NULL;
9986 assert(numfree == 0);
9987 return freelist_size;
9988}
9989
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990void
Thomas Wouters78890102000-07-22 19:25:51 +00009991_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009993 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009995 Py_XDECREF(unicode_empty);
9996 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00009997
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009998 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009999 if (unicode_latin1[i]) {
10000 Py_DECREF(unicode_latin1[i]);
10001 unicode_latin1[i] = NULL;
10002 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010003 }
Christian Heimesa156e092008-02-16 07:38:31 +000010004 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010006
/* Intern the string *p in place.

   p must point to a non-NULL str object; anything else is a fatal
   error.  Instances of str subclasses and strings that are already
   interned are left untouched.

   If an equal string is already in the `interned` dict, *p is replaced
   by that string (the old reference is released, a new one to the
   interned string is taken).  Otherwise *p itself is entered into the
   dict and marked SSTATE_INTERNED_MORTAL.

   Failures to create or update the dict are swallowed: the pending
   exception is cleared and *p is simply left un-interned. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand that one out. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The insertion runs with the thread's recursion_critical flag set;
       the flag is reset to 0 on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
10054
10055void
10056PyUnicode_InternImmortal(PyObject **p)
10057{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010058 PyUnicode_InternInPlace(p);
10059 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10060 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10061 Py_INCREF(*p);
10062 }
Walter Dörwald16807132007-05-25 13:52:07 +000010063}
10064
10065PyObject *
10066PyUnicode_InternFromString(const char *cp)
10067{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010068 PyObject *s = PyUnicode_FromString(cp);
10069 if (s == NULL)
10070 return NULL;
10071 PyUnicode_InternInPlace(&s);
10072 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010073}
10074
10075void _Py_ReleaseInternedUnicodeStrings(void)
10076{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010077 PyObject *keys;
10078 PyUnicodeObject *s;
10079 Py_ssize_t i, n;
10080 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010081
Benjamin Peterson14339b62009-01-31 16:36:08 +000010082 if (interned == NULL || !PyDict_Check(interned))
10083 return;
10084 keys = PyDict_Keys(interned);
10085 if (keys == NULL || !PyList_Check(keys)) {
10086 PyErr_Clear();
10087 return;
10088 }
Walter Dörwald16807132007-05-25 13:52:07 +000010089
Benjamin Peterson14339b62009-01-31 16:36:08 +000010090 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10091 detector, interned unicode strings are not forcibly deallocated;
10092 rather, we give them their stolen references back, and then clear
10093 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010094
Benjamin Peterson14339b62009-01-31 16:36:08 +000010095 n = PyList_GET_SIZE(keys);
10096 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010097 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010098 for (i = 0; i < n; i++) {
10099 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10100 switch (s->state) {
10101 case SSTATE_NOT_INTERNED:
10102 /* XXX Shouldn't happen */
10103 break;
10104 case SSTATE_INTERNED_IMMORTAL:
10105 Py_REFCNT(s) += 1;
10106 immortal_size += s->length;
10107 break;
10108 case SSTATE_INTERNED_MORTAL:
10109 Py_REFCNT(s) += 2;
10110 mortal_size += s->length;
10111 break;
10112 default:
10113 Py_FatalError("Inconsistent interned string state.");
10114 }
10115 s->state = SSTATE_NOT_INTERNED;
10116 }
10117 fprintf(stderr, "total size of all interned strings: "
10118 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10119 "mortal/immortal\n", mortal_size, immortal_size);
10120 Py_DECREF(keys);
10121 PyDict_Clear(interned);
10122 Py_DECREF(interned);
10123 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010124}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010125
10126
10127/********************* Unicode Iterator **************************/
10128
/* State of a str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;      /* index of the next character to yield */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
10134
/* tp_dealloc for the str iterator: stop GC tracking, release the
   reference to the iterated string (if still held) and free the
   iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
10142
/* tp_traverse for the str iterator: the iterated string is the only
   object reference it holds.  Always returns 0 unless Py_VISIT returns
   early. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
10149
10150static PyObject *
10151unicodeiter_next(unicodeiterobject *it)
10152{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010153 PyUnicodeObject *seq;
10154 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010155
Benjamin Peterson14339b62009-01-31 16:36:08 +000010156 assert(it != NULL);
10157 seq = it->it_seq;
10158 if (seq == NULL)
10159 return NULL;
10160 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010161
Benjamin Peterson14339b62009-01-31 16:36:08 +000010162 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10163 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010164 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010165 if (item != NULL)
10166 ++it->it_index;
10167 return item;
10168 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010169
Benjamin Peterson14339b62009-01-31 16:36:08 +000010170 Py_DECREF(seq);
10171 it->it_seq = NULL;
10172 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010173}
10174
10175static PyObject *
10176unicodeiter_len(unicodeiterobject *it)
10177{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010178 Py_ssize_t len = 0;
10179 if (it->it_seq)
10180 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10181 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010182}
10183
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator: only __length_hint__, which takes
   no arguments. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
10191
/* Type object for the str iterator.  Slots are given positionally; the
   comment on each line names the slot being filled.  The type takes
   part in garbage collection (Py_TPFLAGS_HAVE_GC with a tp_traverse). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                             /* tp_name */
    sizeof(unicodeiterobject),                  /* tp_basicsize */
    0,                                          /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,            /* tp_dealloc */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_reserved */
    0,                                          /* tp_repr */
    0,                                          /* tp_as_number */
    0,                                          /* tp_as_sequence */
    0,                                          /* tp_as_mapping */
    0,                                          /* tp_hash */
    0,                                          /* tp_call */
    0,                                          /* tp_str */
    PyObject_GenericGetAttr,                    /* tp_getattro */
    0,                                          /* tp_setattro */
    0,                                          /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                          /* tp_doc */
    (traverseproc)unicodeiter_traverse,         /* tp_traverse */
    0,                                          /* tp_clear */
    0,                                          /* tp_richcompare */
    0,                                          /* tp_weaklistoffset */
    PyObject_SelfIter,                          /* tp_iter */
    (iternextfunc)unicodeiter_next,             /* tp_iternext */
    unicodeiter_methods,                        /* tp_methods */
    0,                                          /* tp_members */
};
10224
10225static PyObject *
10226unicode_iter(PyObject *seq)
10227{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010228 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010229
Benjamin Peterson14339b62009-01-31 16:36:08 +000010230 if (!PyUnicode_Check(seq)) {
10231 PyErr_BadInternalCall();
10232 return NULL;
10233 }
10234 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10235 if (it == NULL)
10236 return NULL;
10237 it->it_index = 0;
10238 Py_INCREF(seq);
10239 it->it_seq = (PyUnicodeObject *)seq;
10240 _PyObject_GC_TRACK(it);
10241 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010242}
10243
Martin v. Löwis5b222132007-06-10 09:51:05 +000010244size_t
10245Py_UNICODE_strlen(const Py_UNICODE *u)
10246{
10247 int res = 0;
10248 while(*u++)
10249 res++;
10250 return res;
10251}
10252
10253Py_UNICODE*
10254Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10255{
10256 Py_UNICODE *u = s1;
10257 while ((*u++ = *s2++));
10258 return s1;
10259}
10260
10261Py_UNICODE*
10262Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10263{
10264 Py_UNICODE *u = s1;
10265 while ((*u++ = *s2++))
10266 if (n-- == 0)
10267 break;
10268 return s1;
10269}
10270
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010271Py_UNICODE*
10272Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10273{
10274 Py_UNICODE *u1 = s1;
10275 u1 += Py_UNICODE_strlen(u1);
10276 Py_UNICODE_strcpy(u1, s2);
10277 return s1;
10278}
10279
Martin v. Löwis5b222132007-06-10 09:51:05 +000010280int
10281Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10282{
10283 while (*s1 && *s2 && *s1 == *s2)
10284 s1++, s2++;
10285 if (*s1 && *s2)
10286 return (*s1 < *s2) ? -1 : +1;
10287 if (*s1)
10288 return 1;
10289 if (*s2)
10290 return -1;
10291 return 0;
10292}
10293
Victor Stinneref8d95c2010-08-16 22:03:11 +000010294int
10295Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10296{
10297 register Py_UNICODE u1, u2;
10298 for (; n != 0; n--) {
10299 u1 = *s1;
10300 u2 = *s2;
10301 if (u1 != u2)
10302 return (u1 < u2) ? -1 : +1;
10303 if (u1 == '\0')
10304 return 0;
10305 s1++;
10306 s2++;
10307 }
10308 return 0;
10309}
10310
Martin v. Löwis5b222132007-06-10 09:51:05 +000010311Py_UNICODE*
10312Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10313{
10314 const Py_UNICODE *p;
10315 for (p = s; *p; p++)
10316 if (*p == c)
10317 return (Py_UNICODE*)p;
10318 return NULL;
10319}
10320
Victor Stinner331ea922010-08-10 16:37:20 +000010321Py_UNICODE*
10322Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10323{
10324 const Py_UNICODE *p;
10325 p = s + Py_UNICODE_strlen(s);
10326 while (p != s) {
10327 p--;
10328 if (*p == c)
10329 return (Py_UNICODE*)p;
10330 }
10331 return NULL;
10332}
10333
Victor Stinner71133ff2010-09-01 23:43:53 +000010334Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010335PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010336{
10337 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10338 Py_UNICODE *copy;
10339 Py_ssize_t size;
10340
10341 /* Ensure we won't overflow the size. */
10342 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10343 PyErr_NoMemory();
10344 return NULL;
10345 }
10346 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10347 size *= sizeof(Py_UNICODE);
10348 copy = PyMem_Malloc(size);
10349 if (copy == NULL) {
10350 PyErr_NoMemory();
10351 return NULL;
10352 }
10353 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10354 return copy;
10355}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010356
Georg Brandl66c221e2010-10-14 07:04:07 +000010357/* A _string module, to export formatter_parser and formatter_field_name_split
10358 to the string.Formatter class implemented in Python. */
10359
10360static PyMethodDef _string_methods[] = {
10361 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10362 METH_O, PyDoc_STR("split the argument as a field name")},
10363 {"formatter_parser", (PyCFunction) formatter_parser,
10364 METH_O, PyDoc_STR("parse the argument as a format string")},
10365 {NULL, NULL}
10366};
10367
10368static struct PyModuleDef _string_module = {
10369 PyModuleDef_HEAD_INIT,
10370 "_string",
10371 PyDoc_STR("string helper module"),
10372 0,
10373 _string_methods,
10374 NULL,
10375 NULL,
10376 NULL,
10377 NULL
10378};
10379
10380PyMODINIT_FUNC
10381PyInit__string(void)
10382{
10383 return PyModule_Create(&_string_module);
10384}
10385
10386
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010387#ifdef __cplusplus
10388}
10389#endif