blob: 48ea0a2043e5b8d0e0b1691b1bcb5a48e1cf3874 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
Christian Heimes190d79e2008-01-30 11:58:22 +0000114/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point 0x00..0x7F: an entry is 1 when the
   character is one of the whitespace characters named below, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
144
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000145static PyObject *unicode_encode_call_errorhandler(const char *errors,
146 PyObject **errorHandler,const char *encoding, const char *reason,
147 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
148 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
149
Victor Stinner31be90b2010-04-22 19:38:16 +0000150static void raise_encode_exception(PyObject **exceptionObject,
151 const char *encoding,
152 const Py_UNICODE *unicode, Py_ssize_t size,
153 Py_ssize_t startpos, Py_ssize_t endpos,
154 const char *reason);
155
Christian Heimes190d79e2008-01-30 11:58:22 +0000156/* Same for linebreaks */
/* Lookup table indexed by code point 0x00..0x7F: an entry is 1 when the
   character is one of the line break characters named below, else 0.
   The table is only ever read, so it is const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
183
184
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000185Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000186PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000188#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000189 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 /* This is actually an illegal character, so it should
192 not be passed to unichr. */
193 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#endif
195}
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197/* --- Bloom Filters ----------------------------------------------------- */
198
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each unicode character as the bit index. */
202
203/* the linebreak mask is set up by Unicode_Init below */
204
Antoine Pitrouf068f942010-01-13 14:19:12 +0000205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
/* Integer type that holds one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set, in the lvalue `mask`, the bit selected by the
   low bits of `ch`.
   BLOOM(mask, ch): non-zero when that bit is set in `mask`.  A zero result
   proves `ch` was never added; a non-zero result may be a false positive.
   `mask` is parenthesized so that expression arguments such as `a | b`
   are tested as a whole. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* True when `ch` is a line break: table lookup below 128, otherwise the
   bloom mask is used to skip the Py_UNICODE_ISLINEBREAK() call when it
   can.  `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228 /* calculate simple bloom-style bitmask for a given unicode string */
229
Antoine Pitrouf068f942010-01-13 14:19:12 +0000230 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 Py_ssize_t i;
232
233 mask = 0;
234 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000235 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236
237 return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242 Py_ssize_t i;
243
244 for (i = 0; i < setlen; i++)
245 if (set[i] == chr)
246 return 1;
247
248 return 0;
249}
250
/* True when `chr` is in `set`: the bloom mask is tested first so that
   unicode_member() is only called on a possible hit.  The expansion is
   fully parenthesized so it behaves as a single operand (for example
   under `!`).  `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
253
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254/* --- Unicode Object ----------------------------------------------------- */
255
256static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259{
260 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000261
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000266 /* Resizing shared object (unicode_empty or single character
267 objects) in-place is not allowed. Use PyUnicode_Resize()
268 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 (unicode->length == 1 &&
272 unicode->str[0] < 256U &&
273 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000275 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 return -1;
277 }
278
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279 /* We allocate one more byte to make sure the string is Ux0000 terminated.
280 The overallocation is also used by fastsearch, which assumes that it's
281 safe to look at str[length] (without making any assumptions about what
282 it contains). */
283
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000285 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000286 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 PyErr_NoMemory();
290 return -1;
291 }
292 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294
Benjamin Peterson29060642009-01-31 22:14:21 +0000295 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000298 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000417 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 }
419 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000420 *(PyUnicodeObject **)unicode = free_list;
421 free_list = unicode;
422 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000425 PyObject_DEL(unicode->str);
426 Py_XDECREF(unicode->defenc);
427 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 }
429}
430
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433{
434 register PyUnicodeObject *v;
435
436 /* Argument checks */
437 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000441 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 PyErr_BadInternalCall();
444 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 }
446
447 /* Resizing unicode_empty and single character objects is not
448 possible since these are being shared. We simply return a fresh
449 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000450 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 (v == unicode_empty || v->length == 1)) {
452 PyUnicodeObject *w = _PyUnicode_New(length);
453 if (w == NULL)
454 return -1;
455 Py_UNICODE_COPY(w->str, v->str,
456 length < v->length ? length : v->length);
457 Py_DECREF(*unicode);
458 *unicode = w;
459 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
461
462 /* Note that we don't have to modify *unicode for unshared Unicode
463 objects, since we can modify them in-place. */
464 return unicode_resize(v, length);
465}
466
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474{
475 PyUnicodeObject *unicode;
476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 /* If the Unicode data is known at construction time, we can apply
478 some optimizations which share commonly used objects. */
479 if (u != NULL) {
480
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000486
487 /* Single character Unicode objects in the Latin-1 range are
488 shared when using this constructor */
489 if (size == 1 && *u < 256) {
490 unicode = unicode_latin1[*u];
491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = *u;
496 unicode_latin1[*u] = unicode;
497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000501 }
Tim Petersced69f82003-09-16 20:30:58 +0000502
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 /* Copy the Unicode data into the new object */
508 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000509 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 return (PyObject *)unicode;
512}
513
Walter Dörwaldd2034312007-05-18 16:29:38 +0000514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000515{
516 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000517
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 return NULL;
522 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000525 some optimizations which share commonly used objects.
526 Also, this means the input must be UTF-8, so fall back to the
527 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (u != NULL) {
529
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 /* Optimization for empty strings */
531 if (size == 0 && unicode_empty != NULL) {
532 Py_INCREF(unicode_empty);
533 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000534 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000535
536 /* Single characters are shared when using this constructor.
537 Restrict to ASCII, since the input must be UTF-8. */
538 if (size == 1 && Py_CHARMASK(*u) < 128) {
539 unicode = unicode_latin1[Py_CHARMASK(*u)];
540 if (!unicode) {
541 unicode = _PyUnicode_New(1);
542 if (!unicode)
543 return NULL;
544 unicode->str[0] = Py_CHARMASK(*u);
545 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 }
547 Py_INCREF(unicode);
548 return (PyObject *)unicode;
549 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000550
551 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000552 }
553
Walter Dörwald55507312007-05-18 13:12:10 +0000554 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 if (!unicode)
556 return NULL;
557
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 return (PyObject *)unicode;
559}
560
Walter Dörwaldd2034312007-05-18 16:29:38 +0000561PyObject *PyUnicode_FromString(const char *u)
562{
563 size_t size = strlen(u);
564 if (size > PY_SSIZE_T_MAX) {
565 PyErr_SetString(PyExc_OverflowError, "input too long");
566 return NULL;
567 }
568
569 return PyUnicode_FromStringAndSize(u, size);
570}
571
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572#ifdef HAVE_WCHAR_H
573
Mark Dickinson081dfee2009-03-18 14:47:41 +0000574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581 to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584 Py_ssize_t size)
585{
586 PyUnicodeObject *unicode;
587 register Py_ssize_t i;
588 Py_ssize_t alloc;
589 const wchar_t *orig_w;
590
591 if (w == NULL) {
592 if (size == 0)
593 return PyUnicode_FromStringAndSize(NULL, 0);
594 PyErr_BadInternalCall();
595 return NULL;
596 }
597
598 if (size == -1) {
599 size = wcslen(w);
600 }
601
602 alloc = size;
603 orig_w = w;
604 for (i = size; i > 0; i--) {
605 if (*w > 0xFFFF)
606 alloc++;
607 w++;
608 }
609 w = orig_w;
610 unicode = _PyUnicode_New(alloc);
611 if (!unicode)
612 return NULL;
613
614 /* Copy the wchar_t data into the new object */
615 {
616 register Py_UNICODE *u;
617 u = PyUnicode_AS_UNICODE(unicode);
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF) {
620 wchar_t ordinal = *w++;
621 ordinal -= 0x10000;
622 *u++ = 0xD800 | (ordinal >> 10);
623 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 }
625 else
626 *u++ = *w++;
627 }
628 }
629 return (PyObject *)unicode;
630}
631
632#else
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636{
637 PyUnicodeObject *unicode;
638
639 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000640 if (size == 0)
641 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == -1) {
647 size = wcslen(w);
648 }
649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = _PyUnicode_New(size);
651 if (!unicode)
652 return NULL;
653
654 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000657#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000659 register Py_UNICODE *u;
660 register Py_ssize_t i;
661 u = PyUnicode_AS_UNICODE(unicode);
662 for (i = size; i > 0; i--)
663 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665#endif
666
667 return (PyObject *)unicode;
668}
669
Mark Dickinson081dfee2009-03-18 14:47:41 +0000670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
Walter Dörwald346737f2007-05-31 10:44:43 +0000674static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000688 else if (longlongflag) {
689 /* longlongflag should only ever be nonzero on machines with
690 HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692 char *f = PY_FORMAT_LONG_LONG;
693 while (*f)
694 *fmt++ = *f++;
695#else
696 /* we shouldn't ever get here */
697 assert(0);
698 *fmt++ = 'l';
699#endif
700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000701 else if (size_tflag) {
702 char *f = PY_FORMAT_SIZE_T;
703 while (*f)
704 *fmt++ = *f++;
705 }
706 *fmt++ = c;
707 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000708}
709
Walter Dörwaldd2034312007-05-18 16:29:38 +0000710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
711
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000712/* size of fixed-size buffer for formatting single arguments */
713#define ITEM_BUFFER_LEN 21
714/* maximum number of characters required for output of %ld. 21 characters
715 allows for 64-bit integers (in decimal) and an optional sign. */
716#define MAX_LONG_CHARS 21
717/* maximum number of characters required for output of %lld.
718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 va_list count;
726 Py_ssize_t callcount = 0;
727 PyObject **callresults = NULL;
728 PyObject **callresult = NULL;
729 Py_ssize_t n = 0;
730 int width = 0;
731 int precision = 0;
732 int zeropad;
733 const char* f;
734 Py_UNICODE *s;
735 PyObject *string;
736 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000737 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 /* use abuffer instead of buffer, if we need more space
739 * (which can happen if there's a format specifier with width). */
740 char *abuffer = NULL;
741 char *realbuffer;
742 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000746 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000747 /* step 1: count the number of %S/%R/%A/%s format specifications
748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000752 if (*f == '%') {
753 if (*(f+1)=='%')
754 continue;
755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
756 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000757 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000758 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 ;
761 if (*f == 's')
762 ++callcount;
763 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000764 else if (128 <= (unsigned char)*f) {
765 PyErr_Format(PyExc_ValueError,
766 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000767 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000768 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000769 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 }
772 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 if (callcount) {
775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776 if (!callresults) {
777 PyErr_NoMemory();
778 return NULL;
779 }
780 callresult = callresults;
781 }
782 /* step 3: figure out how large a buffer we need */
783 for (f = format; *f; f++) {
784 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000785#ifdef HAVE_LONG_LONG
786 int longlongflag = 0;
787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000788 const char* p = f;
789 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000790 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796 * they don't affect the amount of space we reserve.
797 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000798 if (*f == 'l') {
799 if (f[1] == 'd' || f[1] == 'u') {
800 ++f;
801 }
802#ifdef HAVE_LONG_LONG
803 else if (f[1] == 'l' &&
804 (f[2] == 'd' || f[2] == 'u')) {
805 longlongflag = 1;
806 f += 2;
807 }
808#endif
809 }
810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000811 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000812 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
Benjamin Peterson14339b62009-01-31 16:36:08 +0000814 switch (*f) {
815 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000816 {
817#ifndef Py_UNICODE_WIDE
818 int ordinal = va_arg(count, int);
819 if (ordinal > 0xffff)
820 n += 2;
821 else
822 n++;
823#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000824 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000825 n++;
826#endif
827 break;
828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 case '%':
830 n++;
831 break;
832 case 'd': case 'u': case 'i': case 'x':
833 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000834#ifdef HAVE_LONG_LONG
835 if (longlongflag) {
836 if (width < MAX_LONG_LONG_CHARS)
837 width = MAX_LONG_LONG_CHARS;
838 }
839 else
840#endif
841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842 including sign. Decimal takes the most space. This
843 isn't enough for octal. If a width is specified we
844 need more (which we allocate later). */
845 if (width < MAX_LONG_CHARS)
846 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000848 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000849 if (abuffersize < width)
850 abuffersize = width;
851 break;
852 case 's':
853 {
854 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000855 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857 if (!str)
858 goto fail;
859 n += PyUnicode_GET_SIZE(str);
860 /* Remember the str and switch to the next slot */
861 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000862 break;
863 }
864 case 'U':
865 {
866 PyObject *obj = va_arg(count, PyObject *);
867 assert(obj && PyUnicode_Check(obj));
868 n += PyUnicode_GET_SIZE(obj);
869 break;
870 }
871 case 'V':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 const char *str = va_arg(count, const char *);
875 assert(obj || str);
876 assert(!obj || PyUnicode_Check(obj));
877 if (obj)
878 n += PyUnicode_GET_SIZE(obj);
879 else
880 n += strlen(str);
881 break;
882 }
883 case 'S':
884 {
885 PyObject *obj = va_arg(count, PyObject *);
886 PyObject *str;
887 assert(obj);
888 str = PyObject_Str(obj);
889 if (!str)
890 goto fail;
891 n += PyUnicode_GET_SIZE(str);
892 /* Remember the str and switch to the next slot */
893 *callresult++ = str;
894 break;
895 }
896 case 'R':
897 {
898 PyObject *obj = va_arg(count, PyObject *);
899 PyObject *repr;
900 assert(obj);
901 repr = PyObject_Repr(obj);
902 if (!repr)
903 goto fail;
904 n += PyUnicode_GET_SIZE(repr);
905 /* Remember the repr and switch to the next slot */
906 *callresult++ = repr;
907 break;
908 }
909 case 'A':
910 {
911 PyObject *obj = va_arg(count, PyObject *);
912 PyObject *ascii;
913 assert(obj);
914 ascii = PyObject_ASCII(obj);
915 if (!ascii)
916 goto fail;
917 n += PyUnicode_GET_SIZE(ascii);
918 /* Remember the repr and switch to the next slot */
919 *callresult++ = ascii;
920 break;
921 }
922 case 'p':
923 (void) va_arg(count, int);
924 /* maximum 64-bit pointer representation:
925 * 0xffffffffffffffff
926 * so 19 characters is enough.
927 * XXX I count 18 -- what's the extra for?
928 */
929 n += 19;
930 break;
931 default:
932 /* if we stumble upon an unknown
933 formatting code, copy the rest of
934 the format string to the output
935 string. (we cannot just skip the
936 code, since there's no way to know
937 what's in the argument list) */
938 n += strlen(p);
939 goto expand;
940 }
941 } else
942 n++;
943 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000944 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000945 if (abuffersize > ITEM_BUFFER_LEN) {
946 /* add 1 for sprintf's trailing null byte */
947 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000948 if (!abuffer) {
949 PyErr_NoMemory();
950 goto fail;
951 }
952 realbuffer = abuffer;
953 }
954 else
955 realbuffer = buffer;
956 /* step 4: fill the buffer */
957 /* Since we've analyzed how much space we need for the worst case,
958 we don't have to resize the string.
959 There can be no errors beyond this point. */
960 string = PyUnicode_FromUnicode(NULL, n);
961 if (!string)
962 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000963
Benjamin Peterson14339b62009-01-31 16:36:08 +0000964 s = PyUnicode_AS_UNICODE(string);
965 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000966
Benjamin Peterson14339b62009-01-31 16:36:08 +0000967 for (f = format; *f; f++) {
968 if (*f == '%') {
969 const char* p = f++;
970 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000971 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000972 int size_tflag = 0;
973 zeropad = (*f == '0');
974 /* parse the width.precision part */
975 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000976 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000977 width = (width*10) + *f++ - '0';
978 precision = 0;
979 if (*f == '.') {
980 f++;
David Malcolm96960882010-11-05 17:23:41 +0000981 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000982 precision = (precision*10) + *f++ - '0';
983 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000984 /* Handle %ld, %lu, %lld and %llu. */
985 if (*f == 'l') {
986 if (f[1] == 'd' || f[1] == 'u') {
987 longflag = 1;
988 ++f;
989 }
990#ifdef HAVE_LONG_LONG
991 else if (f[1] == 'l' &&
992 (f[2] == 'd' || f[2] == 'u')) {
993 longlongflag = 1;
994 f += 2;
995 }
996#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000997 }
998 /* handle the size_t flag. */
999 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1000 size_tflag = 1;
1001 ++f;
1002 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001003
Benjamin Peterson14339b62009-01-31 16:36:08 +00001004 switch (*f) {
1005 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001006 {
1007 int ordinal = va_arg(vargs, int);
1008#ifndef Py_UNICODE_WIDE
1009 if (ordinal > 0xffff) {
1010 ordinal -= 0x10000;
1011 *s++ = 0xD800 | (ordinal >> 10);
1012 *s++ = 0xDC00 | (ordinal & 0x3FF);
1013 } else
1014#endif
1015 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001016 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001017 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001019 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1020 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 if (longflag)
1022 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001023#ifdef HAVE_LONG_LONG
1024 else if (longlongflag)
1025 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1026#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001027 else if (size_tflag)
1028 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1029 else
1030 sprintf(realbuffer, fmt, va_arg(vargs, int));
1031 appendstring(realbuffer);
1032 break;
1033 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001034 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1035 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001036 if (longflag)
1037 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001038#ifdef HAVE_LONG_LONG
1039 else if (longlongflag)
1040 sprintf(realbuffer, fmt, va_arg(vargs,
1041 unsigned PY_LONG_LONG));
1042#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001043 else if (size_tflag)
1044 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1045 else
1046 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1047 appendstring(realbuffer);
1048 break;
1049 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001050 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 sprintf(realbuffer, fmt, va_arg(vargs, int));
1052 appendstring(realbuffer);
1053 break;
1054 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001055 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001056 sprintf(realbuffer, fmt, va_arg(vargs, int));
1057 appendstring(realbuffer);
1058 break;
1059 case 's':
1060 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001061 /* unused, since we already have the result */
1062 (void) va_arg(vargs, char *);
1063 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1064 PyUnicode_GET_SIZE(*callresult));
1065 s += PyUnicode_GET_SIZE(*callresult);
1066 /* We're done with the unicode()/repr() => forget it */
1067 Py_DECREF(*callresult);
1068 /* switch to next unicode()/repr() result */
1069 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001070 break;
1071 }
1072 case 'U':
1073 {
1074 PyObject *obj = va_arg(vargs, PyObject *);
1075 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1076 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1077 s += size;
1078 break;
1079 }
1080 case 'V':
1081 {
1082 PyObject *obj = va_arg(vargs, PyObject *);
1083 const char *str = va_arg(vargs, const char *);
1084 if (obj) {
1085 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1086 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1087 s += size;
1088 } else {
1089 appendstring(str);
1090 }
1091 break;
1092 }
1093 case 'S':
1094 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001095 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001096 {
1097 Py_UNICODE *ucopy;
1098 Py_ssize_t usize;
1099 Py_ssize_t upos;
1100 /* unused, since we already have the result */
1101 (void) va_arg(vargs, PyObject *);
1102 ucopy = PyUnicode_AS_UNICODE(*callresult);
1103 usize = PyUnicode_GET_SIZE(*callresult);
1104 for (upos = 0; upos<usize;)
1105 *s++ = ucopy[upos++];
1106 /* We're done with the unicode()/repr() => forget it */
1107 Py_DECREF(*callresult);
1108 /* switch to next unicode()/repr() result */
1109 ++callresult;
1110 break;
1111 }
1112 case 'p':
1113 sprintf(buffer, "%p", va_arg(vargs, void*));
1114 /* %p is ill-defined: ensure leading 0x. */
1115 if (buffer[1] == 'X')
1116 buffer[1] = 'x';
1117 else if (buffer[1] != 'x') {
1118 memmove(buffer+2, buffer, strlen(buffer)+1);
1119 buffer[0] = '0';
1120 buffer[1] = 'x';
1121 }
1122 appendstring(buffer);
1123 break;
1124 case '%':
1125 *s++ = '%';
1126 break;
1127 default:
1128 appendstring(p);
1129 goto end;
1130 }
Victor Stinner1205f272010-09-11 00:54:47 +00001131 }
Victor Stinner1205f272010-09-11 00:54:47 +00001132 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001133 *s++ = *f;
1134 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001135
Benjamin Peterson29060642009-01-31 22:14:21 +00001136 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001137 if (callresults)
1138 PyObject_Free(callresults);
1139 if (abuffer)
1140 PyObject_Free(abuffer);
1141 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1142 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001143 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001144 if (callresults) {
1145 PyObject **callresult2 = callresults;
1146 while (callresult2 < callresult) {
1147 Py_DECREF(*callresult2);
1148 ++callresult2;
1149 }
1150 PyObject_Free(callresults);
1151 }
1152 if (abuffer)
1153 PyObject_Free(abuffer);
1154 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001155}
1156
1157#undef appendstring
1158
1159PyObject *
1160PyUnicode_FromFormat(const char *format, ...)
1161{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 PyObject* ret;
1163 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
1165#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001167#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001168 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001169#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001170 ret = PyUnicode_FromFormatV(format, vargs);
1171 va_end(vargs);
1172 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001173}
1174
Victor Stinner5593d8a2010-10-02 11:11:27 +00001175/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1176 convert a Unicode object to a wide character string.
1177
1178 - If w is NULL: return the number of wide characters (including the nul
1179 character) required to convert the unicode object. Ignore size argument.
1180
1181 - Otherwise: return the number of wide characters (excluding the nul
1182 character) written into w. Write at most size wide characters (including
1183 the nul character). */
1184static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001185unicode_aswidechar(PyUnicodeObject *unicode,
1186 wchar_t *w,
1187 Py_ssize_t size)
1188{
1189#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001190 Py_ssize_t res;
1191 if (w != NULL) {
1192 res = PyUnicode_GET_SIZE(unicode);
1193 if (size > res)
1194 size = res + 1;
1195 else
1196 res = size;
1197 memcpy(w, unicode->str, size * sizeof(wchar_t));
1198 return res;
1199 }
1200 else
1201 return PyUnicode_GET_SIZE(unicode) + 1;
1202#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1203 register const Py_UNICODE *u;
1204 const Py_UNICODE *uend;
1205 const wchar_t *worig, *wend;
1206 Py_ssize_t nchar;
1207
Victor Stinner137c34c2010-09-29 10:25:54 +00001208 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001209 uend = u + PyUnicode_GET_SIZE(unicode);
1210 if (w != NULL) {
1211 worig = w;
1212 wend = w + size;
1213 while (u != uend && w != wend) {
1214 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1215 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1216 {
1217 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1218 u += 2;
1219 }
1220 else {
1221 *w = *u;
1222 u++;
1223 }
1224 w++;
1225 }
1226 if (w != wend)
1227 *w = L'\0';
1228 return w - worig;
1229 }
1230 else {
1231 nchar = 1; /* nul character at the end */
1232 while (u != uend) {
1233 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1234 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1235 u += 2;
1236 else
1237 u++;
1238 nchar++;
1239 }
1240 }
1241 return nchar;
1242#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1243 register Py_UNICODE *u, *uend, ordinal;
1244 register Py_ssize_t i;
1245 wchar_t *worig, *wend;
1246 Py_ssize_t nchar;
1247
1248 u = PyUnicode_AS_UNICODE(unicode);
1249 uend = u + PyUnicode_GET_SIZE(u);
1250 if (w != NULL) {
1251 worig = w;
1252 wend = w + size;
1253 while (u != uend && w != wend) {
1254 ordinal = *u;
1255 if (ordinal > 0xffff) {
1256 ordinal -= 0x10000;
1257 *w++ = 0xD800 | (ordinal >> 10);
1258 *w++ = 0xDC00 | (ordinal & 0x3FF);
1259 }
1260 else
1261 *w++ = ordinal;
1262 u++;
1263 }
1264 if (w != wend)
1265 *w = 0;
1266 return w - worig;
1267 }
1268 else {
1269 nchar = 1; /* nul character */
1270 while (u != uend) {
1271 if (*u > 0xffff)
1272 nchar += 2;
1273 else
1274 nchar++;
1275 u++;
1276 }
1277 return nchar;
1278 }
1279#else
1280# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001281#endif
1282}
1283
1284Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001285PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001286 wchar_t *w,
1287 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288{
1289 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001290 PyErr_BadInternalCall();
1291 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001293 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
Victor Stinner137c34c2010-09-29 10:25:54 +00001296wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001297PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001298 Py_ssize_t *size)
1299{
1300 wchar_t* buffer;
1301 Py_ssize_t buflen;
1302
1303 if (unicode == NULL) {
1304 PyErr_BadInternalCall();
1305 return NULL;
1306 }
1307
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001308 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001309 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001310 PyErr_NoMemory();
1311 return NULL;
1312 }
1313
Victor Stinner137c34c2010-09-29 10:25:54 +00001314 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1315 if (buffer == NULL) {
1316 PyErr_NoMemory();
1317 return NULL;
1318 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001319 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001320 if (size != NULL)
1321 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001322 return buffer;
1323}
1324
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325#endif
1326
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001327PyObject *PyUnicode_FromOrdinal(int ordinal)
1328{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001329 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001330
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001331 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001332 PyErr_SetString(PyExc_ValueError,
1333 "chr() arg not in range(0x110000)");
1334 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001335 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001336
1337#ifndef Py_UNICODE_WIDE
1338 if (ordinal > 0xffff) {
1339 ordinal -= 0x10000;
1340 s[0] = 0xD800 | (ordinal >> 10);
1341 s[1] = 0xDC00 | (ordinal & 0x3FF);
1342 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001343 }
1344#endif
1345
Hye-Shik Chang40574832004-04-06 07:24:51 +00001346 s[0] = (Py_UNICODE)ordinal;
1347 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001348}
1349
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350PyObject *PyUnicode_FromObject(register PyObject *obj)
1351{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001352 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001353 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001354 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001355 Py_INCREF(obj);
1356 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001357 }
1358 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001359 /* For a Unicode subtype that's not a Unicode object,
1360 return a true Unicode object with the same data. */
1361 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1362 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001363 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001364 PyErr_Format(PyExc_TypeError,
1365 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001366 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001367 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001368}
1369
1370PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 const char *encoding,
1372 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001373{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001374 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001375 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001376
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001378 PyErr_BadInternalCall();
1379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001381
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001382 /* Decoding bytes objects is the most common case and should be fast */
1383 if (PyBytes_Check(obj)) {
1384 if (PyBytes_GET_SIZE(obj) == 0) {
1385 Py_INCREF(unicode_empty);
1386 v = (PyObject *) unicode_empty;
1387 }
1388 else {
1389 v = PyUnicode_Decode(
1390 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1391 encoding, errors);
1392 }
1393 return v;
1394 }
1395
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001396 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001397 PyErr_SetString(PyExc_TypeError,
1398 "decoding str is not supported");
1399 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001400 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001401
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001402 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1403 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1404 PyErr_Format(PyExc_TypeError,
1405 "coercing to str: need bytes, bytearray "
1406 "or buffer-like object, %.80s found",
1407 Py_TYPE(obj)->tp_name);
1408 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001409 }
Tim Petersced69f82003-09-16 20:30:58 +00001410
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001411 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001412 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001413 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 }
Tim Petersced69f82003-09-16 20:30:58 +00001415 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001416 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001417
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001418 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001419 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420}
1421
/* Write a normalized copy of encoding into lower: upper case letters become
   lower case and '_' becomes '-', so that e.g. UTF_8 is recognised.
   lower has room for lower_len bytes including the terminating NUL.
   Return 0 on error (encoding is longer than lower_len-1), 1 on success.
   On error lower is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last usable slot, reserved for the terminator */
    char *limit = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        char ch = *src;

        if (dst == limit)
            return 0;
        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1454
1455PyObject *PyUnicode_Decode(const char *s,
1456 Py_ssize_t size,
1457 const char *encoding,
1458 const char *errors)
1459{
1460 PyObject *buffer = NULL, *unicode;
1461 Py_buffer info;
1462 char lower[11]; /* Enough for any encoding shortcut */
1463
1464 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001465 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001466
1467 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001468 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001469 if ((strcmp(lower, "utf-8") == 0) ||
1470 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001471 return PyUnicode_DecodeUTF8(s, size, errors);
1472 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001473 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001474 (strcmp(lower, "iso-8859-1") == 0))
1475 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001476#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001477 else if (strcmp(lower, "mbcs") == 0)
1478 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001479#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001480 else if (strcmp(lower, "ascii") == 0)
1481 return PyUnicode_DecodeASCII(s, size, errors);
1482 else if (strcmp(lower, "utf-16") == 0)
1483 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1484 else if (strcmp(lower, "utf-32") == 0)
1485 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487
1488 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001489 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001490 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001491 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001492 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 if (buffer == NULL)
1494 goto onError;
1495 unicode = PyCodec_Decode(buffer, encoding, errors);
1496 if (unicode == NULL)
1497 goto onError;
1498 if (!PyUnicode_Check(unicode)) {
1499 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001500 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001501 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502 Py_DECREF(unicode);
1503 goto onError;
1504 }
1505 Py_DECREF(buffer);
1506 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001507
Benjamin Peterson29060642009-01-31 22:14:21 +00001508 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 Py_XDECREF(buffer);
1510 return NULL;
1511}
1512
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001513PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1514 const char *encoding,
1515 const char *errors)
1516{
1517 PyObject *v;
1518
1519 if (!PyUnicode_Check(unicode)) {
1520 PyErr_BadArgument();
1521 goto onError;
1522 }
1523
1524 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001525 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001526
1527 /* Decode via the codec registry */
1528 v = PyCodec_Decode(unicode, encoding, errors);
1529 if (v == NULL)
1530 goto onError;
1531 return v;
1532
Benjamin Peterson29060642009-01-31 22:14:21 +00001533 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001534 return NULL;
1535}
1536
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001537PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1538 const char *encoding,
1539 const char *errors)
1540{
1541 PyObject *v;
1542
1543 if (!PyUnicode_Check(unicode)) {
1544 PyErr_BadArgument();
1545 goto onError;
1546 }
1547
1548 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001549 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001550
1551 /* Decode via the codec registry */
1552 v = PyCodec_Decode(unicode, encoding, errors);
1553 if (v == NULL)
1554 goto onError;
1555 if (!PyUnicode_Check(v)) {
1556 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001557 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001558 Py_TYPE(v)->tp_name);
1559 Py_DECREF(v);
1560 goto onError;
1561 }
1562 return v;
1563
Benjamin Peterson29060642009-01-31 22:14:21 +00001564 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001565 return NULL;
1566}
1567
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001569 Py_ssize_t size,
1570 const char *encoding,
1571 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001572{
1573 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001574
Guido van Rossumd57fd912000-03-10 22:53:23 +00001575 unicode = PyUnicode_FromUnicode(s, size);
1576 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1579 Py_DECREF(unicode);
1580 return v;
1581}
1582
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001583PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1584 const char *encoding,
1585 const char *errors)
1586{
1587 PyObject *v;
1588
1589 if (!PyUnicode_Check(unicode)) {
1590 PyErr_BadArgument();
1591 goto onError;
1592 }
1593
1594 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001595 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001596
1597 /* Encode via the codec registry */
1598 v = PyCodec_Encode(unicode, encoding, errors);
1599 if (v == NULL)
1600 goto onError;
1601 return v;
1602
Benjamin Peterson29060642009-01-31 22:14:21 +00001603 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001604 return NULL;
1605}
1606
Victor Stinnerad158722010-10-27 00:25:46 +00001607PyObject *
1608PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001609{
Victor Stinner313a1202010-06-11 23:56:51 +00001610#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001611 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1612 PyUnicode_GET_SIZE(unicode),
1613 NULL);
1614#elif defined(__APPLE__)
1615 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1616 PyUnicode_GET_SIZE(unicode),
1617 "surrogateescape");
1618#else
1619 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001620 return PyUnicode_AsEncodedString(unicode,
1621 Py_FileSystemDefaultEncoding,
1622 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001623 }
1624 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001625 /* locale encoding with surrogateescape */
1626 wchar_t *wchar;
1627 char *bytes;
1628 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001629 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001630
1631 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1632 if (wchar == NULL)
1633 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001634 bytes = _Py_wchar2char(wchar, &error_pos);
1635 if (bytes == NULL) {
1636 if (error_pos != (size_t)-1) {
1637 char *errmsg = strerror(errno);
1638 PyObject *exc = NULL;
1639 if (errmsg == NULL)
1640 errmsg = "Py_wchar2char() failed";
1641 raise_encode_exception(&exc,
1642 "filesystemencoding",
1643 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1644 error_pos, error_pos+1,
1645 errmsg);
1646 Py_XDECREF(exc);
1647 }
1648 else
1649 PyErr_NoMemory();
1650 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001651 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001652 }
1653 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001654
1655 bytes_obj = PyBytes_FromString(bytes);
1656 PyMem_Free(bytes);
1657 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001658 }
Victor Stinnerad158722010-10-27 00:25:46 +00001659#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001660}
1661
Guido van Rossumd57fd912000-03-10 22:53:23 +00001662PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1663 const char *encoding,
1664 const char *errors)
1665{
1666 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001667 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001668
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 if (!PyUnicode_Check(unicode)) {
1670 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001671 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672 }
Fred Drakee4315f52000-05-09 19:53:39 +00001673
Tim Petersced69f82003-09-16 20:30:58 +00001674 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001675 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1676 PyUnicode_GET_SIZE(unicode),
1677 errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001678
1679 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001680 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001681 if ((strcmp(lower, "utf-8") == 0) ||
1682 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001683 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1684 PyUnicode_GET_SIZE(unicode),
1685 errors);
1686 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001687 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001688 (strcmp(lower, "iso-8859-1") == 0))
1689 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1690 PyUnicode_GET_SIZE(unicode),
1691 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001692#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001693 else if (strcmp(lower, "mbcs") == 0)
1694 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1695 PyUnicode_GET_SIZE(unicode),
1696 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001697#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001698 else if (strcmp(lower, "ascii") == 0)
1699 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1700 PyUnicode_GET_SIZE(unicode),
1701 errors);
1702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703
1704 /* Encode via the codec registry */
1705 v = PyCodec_Encode(unicode, encoding, errors);
1706 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001707 return NULL;
1708
1709 /* The normal path */
1710 if (PyBytes_Check(v))
1711 return v;
1712
1713 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001714 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001715 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001716 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001717
1718 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1719 "encoder %s returned bytearray instead of bytes",
1720 encoding);
1721 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001722 Py_DECREF(v);
1723 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001724 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001725
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001726 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1727 Py_DECREF(v);
1728 return b;
1729 }
1730
1731 PyErr_Format(PyExc_TypeError,
1732 "encoder did not return a bytes object (type=%.400s)",
1733 Py_TYPE(v)->tp_name);
1734 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001735 return NULL;
1736}
1737
1738PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1739 const char *encoding,
1740 const char *errors)
1741{
1742 PyObject *v;
1743
1744 if (!PyUnicode_Check(unicode)) {
1745 PyErr_BadArgument();
1746 goto onError;
1747 }
1748
1749 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001750 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001751
1752 /* Encode via the codec registry */
1753 v = PyCodec_Encode(unicode, encoding, errors);
1754 if (v == NULL)
1755 goto onError;
1756 if (!PyUnicode_Check(v)) {
1757 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001758 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001759 Py_TYPE(v)->tp_name);
1760 Py_DECREF(v);
1761 goto onError;
1762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001764
Benjamin Peterson29060642009-01-31 22:14:21 +00001765 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001766 return NULL;
1767}
1768
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001769PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001770 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001771{
1772 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001773 if (v)
1774 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001775 if (errors != NULL)
1776 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001777 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001778 PyUnicode_GET_SIZE(unicode),
1779 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001780 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001781 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001782 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001783 return v;
1784}
1785
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001786PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001787PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001788 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001789 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1790}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001791
Christian Heimes5894ba72007-11-04 11:43:14 +00001792PyObject*
1793PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1794{
Victor Stinnerad158722010-10-27 00:25:46 +00001795#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1796 return PyUnicode_DecodeMBCS(s, size, NULL);
1797#elif defined(__APPLE__)
1798 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1799#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001800 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1801 can be undefined. If it is case, decode using UTF-8. The following assumes
1802 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1803 bootstrapping process where the codecs aren't ready yet.
1804 */
1805 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001806 return PyUnicode_Decode(s, size,
1807 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001808 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001809 }
1810 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001811 /* locale encoding with surrogateescape */
1812 wchar_t *wchar;
1813 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001814 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001815
1816 if (s[size] != '\0' || size != strlen(s)) {
1817 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1818 return NULL;
1819 }
1820
Victor Stinner168e1172010-10-16 23:16:16 +00001821 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001822 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001823 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001824
Victor Stinner168e1172010-10-16 23:16:16 +00001825 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001826 PyMem_Free(wchar);
1827 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001828 }
Victor Stinnerad158722010-10-27 00:25:46 +00001829#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001830}
1831
Martin v. Löwis011e8422009-05-05 04:43:17 +00001832
1833int
1834PyUnicode_FSConverter(PyObject* arg, void* addr)
1835{
1836 PyObject *output = NULL;
1837 Py_ssize_t size;
1838 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001839 if (arg == NULL) {
1840 Py_DECREF(*(PyObject**)addr);
1841 return 1;
1842 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001843 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001844 output = arg;
1845 Py_INCREF(output);
1846 }
1847 else {
1848 arg = PyUnicode_FromObject(arg);
1849 if (!arg)
1850 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001851 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001852 Py_DECREF(arg);
1853 if (!output)
1854 return 0;
1855 if (!PyBytes_Check(output)) {
1856 Py_DECREF(output);
1857 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1858 return 0;
1859 }
1860 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001861 size = PyBytes_GET_SIZE(output);
1862 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001863 if (size != strlen(data)) {
1864 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1865 Py_DECREF(output);
1866 return 0;
1867 }
1868 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001869 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001870}
1871
1872
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001873int
1874PyUnicode_FSDecoder(PyObject* arg, void* addr)
1875{
1876 PyObject *output = NULL;
1877 Py_ssize_t size;
1878 void *data;
1879 if (arg == NULL) {
1880 Py_DECREF(*(PyObject**)addr);
1881 return 1;
1882 }
1883 if (PyUnicode_Check(arg)) {
1884 output = arg;
1885 Py_INCREF(output);
1886 }
1887 else {
1888 arg = PyBytes_FromObject(arg);
1889 if (!arg)
1890 return 0;
1891 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1892 PyBytes_GET_SIZE(arg));
1893 Py_DECREF(arg);
1894 if (!output)
1895 return 0;
1896 if (!PyUnicode_Check(output)) {
1897 Py_DECREF(output);
1898 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1899 return 0;
1900 }
1901 }
1902 size = PyUnicode_GET_SIZE(output);
1903 data = PyUnicode_AS_UNICODE(output);
1904 if (size != Py_UNICODE_strlen(data)) {
1905 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1906 Py_DECREF(output);
1907 return 0;
1908 }
1909 *(PyObject**)addr = output;
1910 return Py_CLEANUP_SUPPORTED;
1911}
1912
1913
Martin v. Löwis5b222132007-06-10 09:51:05 +00001914char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001915_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001916{
Christian Heimesf3863112007-11-22 07:46:41 +00001917 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001918 if (!PyUnicode_Check(unicode)) {
1919 PyErr_BadArgument();
1920 return NULL;
1921 }
Christian Heimesf3863112007-11-22 07:46:41 +00001922 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1923 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001924 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001925 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001926 *psize = PyBytes_GET_SIZE(bytes);
1927 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001928}
1929
1930char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001931_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001932{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001933 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001934}
1935
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1937{
1938 if (!PyUnicode_Check(unicode)) {
1939 PyErr_BadArgument();
1940 goto onError;
1941 }
1942 return PyUnicode_AS_UNICODE(unicode);
1943
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 return NULL;
1946}
1947
Martin v. Löwis18e16552006-02-15 17:27:45 +00001948Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949{
1950 if (!PyUnicode_Check(unicode)) {
1951 PyErr_BadArgument();
1952 goto onError;
1953 }
1954 return PyUnicode_GET_SIZE(unicode);
1955
Benjamin Peterson29060642009-01-31 22:14:21 +00001956 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 return -1;
1958}
1959
/* Name of the default encoding; always the constant string "utf-8".
   The returned pointer refers to static storage. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1964
Victor Stinner554f3f02010-06-16 23:33:54 +00001965/* create or adjust a UnicodeDecodeError */
1966static void
1967make_decode_exception(PyObject **exceptionObject,
1968 const char *encoding,
1969 const char *input, Py_ssize_t length,
1970 Py_ssize_t startpos, Py_ssize_t endpos,
1971 const char *reason)
1972{
1973 if (*exceptionObject == NULL) {
1974 *exceptionObject = PyUnicodeDecodeError_Create(
1975 encoding, input, length, startpos, endpos, reason);
1976 }
1977 else {
1978 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1979 goto onError;
1980 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1981 goto onError;
1982 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1983 goto onError;
1984 }
1985 return;
1986
1987onError:
1988 Py_DECREF(*exceptionObject);
1989 *exceptionObject = NULL;
1990}
1991
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001992/* error handling callback helper:
1993 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00001994 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001995 and adjust various state variables.
1996 return 0 on success, -1 on error
1997*/
1998
1999static
2000int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00002001 const char *encoding, const char *reason,
2002 const char **input, const char **inend, Py_ssize_t *startinpos,
2003 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2004 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002005{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002006 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002007
2008 PyObject *restuple = NULL;
2009 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002010 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002011 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002012 Py_ssize_t requiredsize;
2013 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002014 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002015 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002016 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002017 int res = -1;
2018
2019 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002020 *errorHandler = PyCodec_LookupError(errors);
2021 if (*errorHandler == NULL)
2022 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002023 }
2024
Victor Stinner554f3f02010-06-16 23:33:54 +00002025 make_decode_exception(exceptionObject,
2026 encoding,
2027 *input, *inend - *input,
2028 *startinpos, *endinpos,
2029 reason);
2030 if (*exceptionObject == NULL)
2031 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002032
2033 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2034 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002035 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002036 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002037 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002038 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002039 }
2040 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002041 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002042
2043 /* Copy back the bytes variables, which might have been modified by the
2044 callback */
2045 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2046 if (!inputobj)
2047 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002048 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002049 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002050 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002051 *input = PyBytes_AS_STRING(inputobj);
2052 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002053 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002054 /* we can DECREF safely, as the exception has another reference,
2055 so the object won't go away. */
2056 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002057
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002058 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002060 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002061 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2062 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002063 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064
2065 /* need more space? (at least enough for what we
2066 have+the replacement+the rest of the string (starting
2067 at the new input position), so we won't have to check space
2068 when there are no errors in the rest of the string) */
2069 repptr = PyUnicode_AS_UNICODE(repunicode);
2070 repsize = PyUnicode_GET_SIZE(repunicode);
2071 requiredsize = *outpos + repsize + insize-newpos;
2072 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002073 if (requiredsize<2*outsize)
2074 requiredsize = 2*outsize;
2075 if (_PyUnicode_Resize(output, requiredsize) < 0)
2076 goto onError;
2077 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002078 }
2079 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002080 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 Py_UNICODE_COPY(*outptr, repptr, repsize);
2082 *outptr += repsize;
2083 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085 /* we made it! */
2086 res = 0;
2087
Benjamin Peterson29060642009-01-31 22:14:21 +00002088 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002089 Py_XDECREF(restuple);
2090 return res;
2091}
2092
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three small helper macros for the base-64 alphabet
   (A-Z, a-z, 0-9, '+', '/').  All of them may evaluate their argument
   more than once. */

/* Non-zero if c belongs to the base-64 alphabet. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z'))

/* 6-bit value of the base-64 character c.  The caller must already know
   that c is a base-64 character: anything unrecognised maps to 63. */

#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                                                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"     \
     [(n) & 0x3f])

/* DECODE_DIRECT: a byte for which this is true decodes to itself when
 * met outside a shift sequence.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
2127
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; only 1..127 can ever be direct, and the
 *             range test runs before the table lookup so larger values
 *             never index the table.  Evaluated more than once.
 * directO  -- non-zero to emit Set O characters as themselves.
 * directWS -- non-zero to emit whitespace as itself.
 *
 * The flag arguments are parenthesized so that compound expressions
 * such as `a || b` keep their meaning inside the expansion. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002173
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002174PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002175 Py_ssize_t size,
2176 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002177{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002178 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2179}
2180
Antoine Pitrou244651a2009-05-04 18:56:13 +00002181/* The decoder. The only state we preserve is our read position,
2182 * i.e. how many characters we have consumed. So if we end in the
2183 * middle of a shift sequence we have to back off the read position
2184 * and the output to the beginning of the sequence, otherwise we lose
2185 * all the shift state (seen bits, number of bits seen, high
2186 * surrogate). */
2187
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002188PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002189 Py_ssize_t size,
2190 const char *errors,
2191 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002192{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002193 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002194 Py_ssize_t startinpos;
2195 Py_ssize_t endinpos;
2196 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002197 const char *e;
2198 PyUnicodeObject *unicode;
2199 Py_UNICODE *p;
2200 const char *errmsg = "";
2201 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002202 Py_UNICODE *shiftOutStart;
2203 unsigned int base64bits = 0;
2204 unsigned long base64buffer = 0;
2205 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002206 PyObject *errorHandler = NULL;
2207 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002208
2209 unicode = _PyUnicode_New(size);
2210 if (!unicode)
2211 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002212 if (size == 0) {
2213 if (consumed)
2214 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002215 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002216 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002217
2218 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002219 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002220 e = s + size;
2221
2222 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002223 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002224 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002225 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002226
Antoine Pitrou244651a2009-05-04 18:56:13 +00002227 if (inShift) { /* in a base-64 section */
2228 if (IS_BASE64(ch)) { /* consume a base-64 character */
2229 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2230 base64bits += 6;
2231 s++;
2232 if (base64bits >= 16) {
2233 /* we have enough bits for a UTF-16 value */
2234 Py_UNICODE outCh = (Py_UNICODE)
2235 (base64buffer >> (base64bits-16));
2236 base64bits -= 16;
2237 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2238 if (surrogate) {
2239 /* expecting a second surrogate */
2240 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2241#ifdef Py_UNICODE_WIDE
2242 *p++ = (((surrogate & 0x3FF)<<10)
2243 | (outCh & 0x3FF)) + 0x10000;
2244#else
2245 *p++ = surrogate;
2246 *p++ = outCh;
2247#endif
2248 surrogate = 0;
2249 }
2250 else {
2251 surrogate = 0;
2252 errmsg = "second surrogate missing";
2253 goto utf7Error;
2254 }
2255 }
2256 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2257 /* first surrogate */
2258 surrogate = outCh;
2259 }
2260 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2261 errmsg = "unexpected second surrogate";
2262 goto utf7Error;
2263 }
2264 else {
2265 *p++ = outCh;
2266 }
2267 }
2268 }
2269 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002270 inShift = 0;
2271 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002272 if (surrogate) {
2273 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002274 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002275 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002276 if (base64bits > 0) { /* left-over bits */
2277 if (base64bits >= 6) {
2278 /* We've seen at least one base-64 character */
2279 errmsg = "partial character in shift sequence";
2280 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002281 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002282 else {
2283 /* Some bits remain; they should be zero */
2284 if (base64buffer != 0) {
2285 errmsg = "non-zero padding bits in shift sequence";
2286 goto utf7Error;
2287 }
2288 }
2289 }
2290 if (ch != '-') {
2291 /* '-' is absorbed; other terminating
2292 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002293 *p++ = ch;
2294 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002295 }
2296 }
2297 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002298 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002299 s++; /* consume '+' */
2300 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002301 s++;
2302 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002303 }
2304 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002305 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002306 shiftOutStart = p;
2307 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002308 }
2309 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002310 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002311 *p++ = ch;
2312 s++;
2313 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002314 else {
2315 startinpos = s-starts;
2316 s++;
2317 errmsg = "unexpected special character";
2318 goto utf7Error;
2319 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002320 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002321utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002322 outpos = p-PyUnicode_AS_UNICODE(unicode);
2323 endinpos = s-starts;
2324 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002325 errors, &errorHandler,
2326 "utf7", errmsg,
2327 &starts, &e, &startinpos, &endinpos, &exc, &s,
2328 &unicode, &outpos, &p))
2329 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002330 }
2331
Antoine Pitrou244651a2009-05-04 18:56:13 +00002332 /* end of string */
2333
2334 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2335 /* if we're in an inconsistent state, that's an error */
2336 if (surrogate ||
2337 (base64bits >= 6) ||
2338 (base64bits > 0 && base64buffer != 0)) {
2339 outpos = p-PyUnicode_AS_UNICODE(unicode);
2340 endinpos = size;
2341 if (unicode_decode_call_errorhandler(
2342 errors, &errorHandler,
2343 "utf7", "unterminated shift sequence",
2344 &starts, &e, &startinpos, &endinpos, &exc, &s,
2345 &unicode, &outpos, &p))
2346 goto onError;
2347 if (s < e)
2348 goto restart;
2349 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002350 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002351
2352 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002353 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002354 if (inShift) {
2355 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002356 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002357 }
2358 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002359 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002360 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002361 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002362
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002363 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002364 goto onError;
2365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002366 Py_XDECREF(errorHandler);
2367 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002368 return (PyObject *)unicode;
2369
Benjamin Peterson29060642009-01-31 22:14:21 +00002370 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002371 Py_XDECREF(errorHandler);
2372 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002373 Py_DECREF(unicode);
2374 return NULL;
2375}
2376
2377
2378PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002379 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002380 int base64SetO,
2381 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002382 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002383{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002384 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002385 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002386 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002387 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002388 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002389 unsigned int base64bits = 0;
2390 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002391 char * out;
2392 char * start;
2393
2394 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002395 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002396
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002397 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002398 return PyErr_NoMemory();
2399
Antoine Pitrou244651a2009-05-04 18:56:13 +00002400 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002401 if (v == NULL)
2402 return NULL;
2403
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002404 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002405 for (;i < size; ++i) {
2406 Py_UNICODE ch = s[i];
2407
Antoine Pitrou244651a2009-05-04 18:56:13 +00002408 if (inShift) {
2409 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2410 /* shifting out */
2411 if (base64bits) { /* output remaining bits */
2412 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2413 base64buffer = 0;
2414 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002415 }
2416 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002417 /* Characters not in the BASE64 set implicitly unshift the sequence
2418 so no '-' is required, except if the character is itself a '-' */
2419 if (IS_BASE64(ch) || ch == '-') {
2420 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002421 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002422 *out++ = (char) ch;
2423 }
2424 else {
2425 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002426 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002427 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002428 else { /* not in a shift sequence */
2429 if (ch == '+') {
2430 *out++ = '+';
2431 *out++ = '-';
2432 }
2433 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2434 *out++ = (char) ch;
2435 }
2436 else {
2437 *out++ = '+';
2438 inShift = 1;
2439 goto encode_char;
2440 }
2441 }
2442 continue;
2443encode_char:
2444#ifdef Py_UNICODE_WIDE
2445 if (ch >= 0x10000) {
2446 /* code first surrogate */
2447 base64bits += 16;
2448 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2449 while (base64bits >= 6) {
2450 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2451 base64bits -= 6;
2452 }
2453 /* prepare second surrogate */
2454 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2455 }
2456#endif
2457 base64bits += 16;
2458 base64buffer = (base64buffer << 16) | ch;
2459 while (base64bits >= 6) {
2460 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2461 base64bits -= 6;
2462 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002463 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002464 if (base64bits)
2465 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2466 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002467 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002468 if (_PyBytes_Resize(&v, out - start) < 0)
2469 return NULL;
2470 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002471}
2472
Antoine Pitrou244651a2009-05-04 18:56:13 +00002473#undef IS_BASE64
2474#undef FROM_BASE64
2475#undef TO_BASE64
2476#undef DECODE_DIRECT
2477#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002478
Guido van Rossumd57fd912000-03-10 22:53:23 +00002479/* --- UTF-8 Codec -------------------------------------------------------- */
2480
/* Map a UTF-8 lead byte to the total number of bytes in the sequence it
   introduces.  Zero means the byte is not a legal lead byte: continuation
   bytes (80-BF), the overlong prefixes C0-C1, and F5-FF.  See RFC 3629 for
   details.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2502
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002504 Py_ssize_t size,
2505 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506{
Walter Dörwald69652032004-09-07 20:24:22 +00002507 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2508}
2509
Antoine Pitrouab868312009-01-10 15:40:25 +00002510/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2511#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2512
2513/* Mask to quickly check whether a C 'long' contains a
2514 non-ASCII, UTF8-encoded char. */
2515#if (SIZEOF_LONG == 8)
2516# define ASCII_CHAR_MASK 0x8080808080808080L
2517#elif (SIZEOF_LONG == 4)
2518# define ASCII_CHAR_MASK 0x80808080L
2519#else
2520# error C 'long' size should be either 4 or 8!
2521#endif
2522
Walter Dörwald69652032004-09-07 20:24:22 +00002523PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002524 Py_ssize_t size,
2525 const char *errors,
2526 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002527{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002528 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002530 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002531 Py_ssize_t startinpos;
2532 Py_ssize_t endinpos;
2533 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002534 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 PyUnicodeObject *unicode;
2536 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002537 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 PyObject *errorHandler = NULL;
2539 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540
2541 /* Note: size will always be longer than the resulting Unicode
2542 character count */
2543 unicode = _PyUnicode_New(size);
2544 if (!unicode)
2545 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002546 if (size == 0) {
2547 if (consumed)
2548 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551
2552 /* Unpack UTF-8 encoded data */
2553 p = unicode->str;
2554 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002555 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556
2557 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002558 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559
2560 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002561 /* Fast path for runs of ASCII characters. Given that common UTF-8
2562 input will consist of an overwhelming majority of ASCII
2563 characters, we try to optimize for this case by checking
2564 as many characters as a C 'long' can contain.
2565 First, check if we can do an aligned read, as most CPUs have
2566 a penalty for unaligned reads.
2567 */
2568 if (!((size_t) s & LONG_PTR_MASK)) {
2569 /* Help register allocation */
2570 register const char *_s = s;
2571 register Py_UNICODE *_p = p;
2572 while (_s < aligned_end) {
2573 /* Read a whole long at a time (either 4 or 8 bytes),
2574 and do a fast unrolled copy if it only contains ASCII
2575 characters. */
2576 unsigned long data = *(unsigned long *) _s;
2577 if (data & ASCII_CHAR_MASK)
2578 break;
2579 _p[0] = (unsigned char) _s[0];
2580 _p[1] = (unsigned char) _s[1];
2581 _p[2] = (unsigned char) _s[2];
2582 _p[3] = (unsigned char) _s[3];
2583#if (SIZEOF_LONG == 8)
2584 _p[4] = (unsigned char) _s[4];
2585 _p[5] = (unsigned char) _s[5];
2586 _p[6] = (unsigned char) _s[6];
2587 _p[7] = (unsigned char) _s[7];
2588#endif
2589 _s += SIZEOF_LONG;
2590 _p += SIZEOF_LONG;
2591 }
2592 s = _s;
2593 p = _p;
2594 if (s == e)
2595 break;
2596 ch = (unsigned char)*s;
2597 }
2598 }
2599
2600 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002601 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 s++;
2603 continue;
2604 }
2605
2606 n = utf8_code_length[ch];
2607
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002608 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002609 if (consumed)
2610 break;
2611 else {
2612 errmsg = "unexpected end of data";
2613 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002614 endinpos = startinpos+1;
2615 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2616 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002617 goto utf8Error;
2618 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620
2621 switch (n) {
2622
2623 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002624 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002625 startinpos = s-starts;
2626 endinpos = startinpos+1;
2627 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628
2629 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002630 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002631 startinpos = s-starts;
2632 endinpos = startinpos+1;
2633 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002634
2635 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002636 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002637 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002638 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002639 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002640 goto utf8Error;
2641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002642 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002643 assert ((ch > 0x007F) && (ch <= 0x07FF));
2644 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 break;
2646
2647 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002648 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2649 will result in surrogates in range d800-dfff. Surrogates are
2650 not valid UTF-8 so they are rejected.
2651 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2652 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002653 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002654 (s[2] & 0xc0) != 0x80 ||
2655 ((unsigned char)s[0] == 0xE0 &&
2656 (unsigned char)s[1] < 0xA0) ||
2657 ((unsigned char)s[0] == 0xED &&
2658 (unsigned char)s[1] > 0x9F)) {
2659 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002660 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002661 endinpos = startinpos + 1;
2662
2663 /* if s[1] first two bits are 1 and 0, then the invalid
2664 continuation byte is s[2], so increment endinpos by 1,
2665 if not, s[1] is invalid and endinpos doesn't need to
2666 be incremented. */
2667 if ((s[1] & 0xC0) == 0x80)
2668 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002669 goto utf8Error;
2670 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002672 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2673 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002674 break;
2675
2676 case 4:
2677 if ((s[1] & 0xc0) != 0x80 ||
2678 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002679 (s[3] & 0xc0) != 0x80 ||
2680 ((unsigned char)s[0] == 0xF0 &&
2681 (unsigned char)s[1] < 0x90) ||
2682 ((unsigned char)s[0] == 0xF4 &&
2683 (unsigned char)s[1] > 0x8F)) {
2684 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002686 endinpos = startinpos + 1;
2687 if ((s[1] & 0xC0) == 0x80) {
2688 endinpos++;
2689 if ((s[2] & 0xC0) == 0x80)
2690 endinpos++;
2691 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 goto utf8Error;
2693 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002694 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002695 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2696 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2697
Fredrik Lundh8f455852001-06-27 18:59:43 +00002698#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002699 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002700#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002701 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002702
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002703 /* translate from 10000..10FFFF to 0..FFFF */
2704 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002705
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002706 /* high surrogate = top 10 bits added to D800 */
2707 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002708
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002709 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002710 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002711#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002712 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 }
2714 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002715 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002716
Benjamin Peterson29060642009-01-31 22:14:21 +00002717 utf8Error:
2718 outpos = p-PyUnicode_AS_UNICODE(unicode);
2719 if (unicode_decode_call_errorhandler(
2720 errors, &errorHandler,
2721 "utf8", errmsg,
2722 &starts, &e, &startinpos, &endinpos, &exc, &s,
2723 &unicode, &outpos, &p))
2724 goto onError;
2725 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726 }
Walter Dörwald69652032004-09-07 20:24:22 +00002727 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002728 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729
2730 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002731 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732 goto onError;
2733
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002734 Py_XDECREF(errorHandler);
2735 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 return (PyObject *)unicode;
2737
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002739 Py_XDECREF(errorHandler);
2740 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 Py_DECREF(unicode);
2742 return NULL;
2743}
2744
Antoine Pitrouab868312009-01-10 15:40:25 +00002745#undef ASCII_CHAR_MASK
2746
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002747#ifdef __APPLE__
2748
2749/* Simplified UTF-8 decoder using surrogateescape error handler,
2750 used to decode the command line arguments on Mac OS X. */
2751
2752wchar_t*
2753_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2754{
2755 int n;
2756 const char *e;
2757 wchar_t *unicode, *p;
2758
2759 /* Note: size will always be longer than the resulting Unicode
2760 character count */
2761 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2762 PyErr_NoMemory();
2763 return NULL;
2764 }
2765 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2766 if (!unicode)
2767 return NULL;
2768
2769 /* Unpack UTF-8 encoded data */
2770 p = unicode;
2771 e = s + size;
2772 while (s < e) {
2773 Py_UCS4 ch = (unsigned char)*s;
2774
2775 if (ch < 0x80) {
2776 *p++ = (wchar_t)ch;
2777 s++;
2778 continue;
2779 }
2780
2781 n = utf8_code_length[ch];
2782 if (s + n > e) {
2783 goto surrogateescape;
2784 }
2785
2786 switch (n) {
2787 case 0:
2788 case 1:
2789 goto surrogateescape;
2790
2791 case 2:
2792 if ((s[1] & 0xc0) != 0x80)
2793 goto surrogateescape;
2794 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2795 assert ((ch > 0x007F) && (ch <= 0x07FF));
2796 *p++ = (wchar_t)ch;
2797 break;
2798
2799 case 3:
2800 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2801 will result in surrogates in range d800-dfff. Surrogates are
2802 not valid UTF-8 so they are rejected.
2803 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2804 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2805 if ((s[1] & 0xc0) != 0x80 ||
2806 (s[2] & 0xc0) != 0x80 ||
2807 ((unsigned char)s[0] == 0xE0 &&
2808 (unsigned char)s[1] < 0xA0) ||
2809 ((unsigned char)s[0] == 0xED &&
2810 (unsigned char)s[1] > 0x9F)) {
2811
2812 goto surrogateescape;
2813 }
2814 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2815 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2816 *p++ = (Py_UNICODE)ch;
2817 break;
2818
2819 case 4:
2820 if ((s[1] & 0xc0) != 0x80 ||
2821 (s[2] & 0xc0) != 0x80 ||
2822 (s[3] & 0xc0) != 0x80 ||
2823 ((unsigned char)s[0] == 0xF0 &&
2824 (unsigned char)s[1] < 0x90) ||
2825 ((unsigned char)s[0] == 0xF4 &&
2826 (unsigned char)s[1] > 0x8F)) {
2827 goto surrogateescape;
2828 }
2829 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2830 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2831 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2832
2833#if SIZEOF_WCHAR_T == 4
2834 *p++ = (wchar_t)ch;
2835#else
2836 /* compute and append the two surrogates: */
2837
2838 /* translate from 10000..10FFFF to 0..FFFF */
2839 ch -= 0x10000;
2840
2841 /* high surrogate = top 10 bits added to D800 */
2842 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2843
2844 /* low surrogate = bottom 10 bits added to DC00 */
2845 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2846#endif
2847 break;
2848 }
2849 s += n;
2850 continue;
2851
2852 surrogateescape:
2853 *p++ = 0xDC00 + ch;
2854 s++;
2855 }
2856 *p = L'\0';
2857 return unicode;
2858}
2859
2860#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002861
Tim Peters602f7402002-04-27 18:03:26 +00002862/* Allocation strategy: if the string is short, convert into a stack buffer
2863 and allocate exactly as much space needed at the end. Else allocate the
2864 maximum possible needed (4 result bytes per Unicode character), and return
2865 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002866*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002867PyObject *
2868PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002869 Py_ssize_t size,
2870 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871{
Tim Peters602f7402002-04-27 18:03:26 +00002872#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002873
Guido van Rossum98297ee2007-11-06 21:34:58 +00002874 Py_ssize_t i; /* index into s of next input byte */
2875 PyObject *result; /* result string object */
2876 char *p; /* next free byte in output buffer */
2877 Py_ssize_t nallocated; /* number of result bytes allocated */
2878 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002879 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002880 PyObject *errorHandler = NULL;
2881 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002882
Tim Peters602f7402002-04-27 18:03:26 +00002883 assert(s != NULL);
2884 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885
Tim Peters602f7402002-04-27 18:03:26 +00002886 if (size <= MAX_SHORT_UNICHARS) {
2887 /* Write into the stack buffer; nallocated can't overflow.
2888 * At the end, we'll allocate exactly as much heap space as it
2889 * turns out we need.
2890 */
2891 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002892 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002893 p = stackbuf;
2894 }
2895 else {
2896 /* Overallocate on the heap, and give the excess back at the end. */
2897 nallocated = size * 4;
2898 if (nallocated / 4 != size) /* overflow! */
2899 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002900 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002901 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002902 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002903 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002904 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002905
Tim Peters602f7402002-04-27 18:03:26 +00002906 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002907 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002908
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002909 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002910 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002912
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002914 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002915 *p++ = (char)(0xc0 | (ch >> 6));
2916 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002917 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002918#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002919 /* Special case: check for high and low surrogate */
2920 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2921 Py_UCS4 ch2 = s[i];
2922 /* Combine the two surrogates to form a UCS4 value */
2923 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2924 i++;
2925
2926 /* Encode UCS4 Unicode ordinals */
2927 *p++ = (char)(0xf0 | (ch >> 18));
2928 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002929 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2930 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002931 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002932#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002933 Py_ssize_t newpos;
2934 PyObject *rep;
2935 Py_ssize_t repsize, k;
2936 rep = unicode_encode_call_errorhandler
2937 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2938 s, size, &exc, i-1, i, &newpos);
2939 if (!rep)
2940 goto error;
2941
2942 if (PyBytes_Check(rep))
2943 repsize = PyBytes_GET_SIZE(rep);
2944 else
2945 repsize = PyUnicode_GET_SIZE(rep);
2946
2947 if (repsize > 4) {
2948 Py_ssize_t offset;
2949
2950 if (result == NULL)
2951 offset = p - stackbuf;
2952 else
2953 offset = p - PyBytes_AS_STRING(result);
2954
2955 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2956 /* integer overflow */
2957 PyErr_NoMemory();
2958 goto error;
2959 }
2960 nallocated += repsize - 4;
2961 if (result != NULL) {
2962 if (_PyBytes_Resize(&result, nallocated) < 0)
2963 goto error;
2964 } else {
2965 result = PyBytes_FromStringAndSize(NULL, nallocated);
2966 if (result == NULL)
2967 goto error;
2968 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2969 }
2970 p = PyBytes_AS_STRING(result) + offset;
2971 }
2972
2973 if (PyBytes_Check(rep)) {
2974 char *prep = PyBytes_AS_STRING(rep);
2975 for(k = repsize; k > 0; k--)
2976 *p++ = *prep++;
2977 } else /* rep is unicode */ {
2978 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2979 Py_UNICODE c;
2980
2981 for(k=0; k<repsize; k++) {
2982 c = prep[k];
2983 if (0x80 <= c) {
2984 raise_encode_exception(&exc, "utf-8", s, size,
2985 i-1, i, "surrogates not allowed");
2986 goto error;
2987 }
2988 *p++ = (char)prep[k];
2989 }
2990 }
2991 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00002992#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002993 }
Victor Stinner445a6232010-04-22 20:01:57 +00002994#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002995 } else if (ch < 0x10000) {
2996 *p++ = (char)(0xe0 | (ch >> 12));
2997 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2998 *p++ = (char)(0x80 | (ch & 0x3f));
2999 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003000 /* Encode UCS4 Unicode ordinals */
3001 *p++ = (char)(0xf0 | (ch >> 18));
3002 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3003 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3004 *p++ = (char)(0x80 | (ch & 0x3f));
3005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003006 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003007
Guido van Rossum98297ee2007-11-06 21:34:58 +00003008 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003009 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003010 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003011 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003012 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003013 }
3014 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003015 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003016 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003017 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003018 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003019 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003020 Py_XDECREF(errorHandler);
3021 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003022 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003023 error:
3024 Py_XDECREF(errorHandler);
3025 Py_XDECREF(exc);
3026 Py_XDECREF(result);
3027 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003028
Tim Peters602f7402002-04-27 18:03:26 +00003029#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030}
3031
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3033{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 if (!PyUnicode_Check(unicode)) {
3035 PyErr_BadArgument();
3036 return NULL;
3037 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003038 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 PyUnicode_GET_SIZE(unicode),
3040 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003041}
3042
Walter Dörwald41980ca2007-08-16 21:55:45 +00003043/* --- UTF-32 Codec ------------------------------------------------------- */
3044
3045PyObject *
3046PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 Py_ssize_t size,
3048 const char *errors,
3049 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003050{
3051 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3052}
3053
3054PyObject *
3055PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 Py_ssize_t size,
3057 const char *errors,
3058 int *byteorder,
3059 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003060{
3061 const char *starts = s;
3062 Py_ssize_t startinpos;
3063 Py_ssize_t endinpos;
3064 Py_ssize_t outpos;
3065 PyUnicodeObject *unicode;
3066 Py_UNICODE *p;
3067#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003068 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003069 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003070#else
3071 const int pairs = 0;
3072#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003073 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003074 int bo = 0; /* assume native ordering by default */
3075 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003076 /* Offsets from q for retrieving bytes in the right order. */
3077#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3078 int iorder[] = {0, 1, 2, 3};
3079#else
3080 int iorder[] = {3, 2, 1, 0};
3081#endif
3082 PyObject *errorHandler = NULL;
3083 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003084
Walter Dörwald41980ca2007-08-16 21:55:45 +00003085 q = (unsigned char *)s;
3086 e = q + size;
3087
3088 if (byteorder)
3089 bo = *byteorder;
3090
3091 /* Check for BOM marks (U+FEFF) in the input and adjust current
3092 byte order setting accordingly. In native mode, the leading BOM
3093 mark is skipped, in all other modes, it is copied to the output
3094 stream as-is (giving a ZWNBSP character). */
3095 if (bo == 0) {
3096 if (size >= 4) {
3097 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003098 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003099#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003100 if (bom == 0x0000FEFF) {
3101 q += 4;
3102 bo = -1;
3103 }
3104 else if (bom == 0xFFFE0000) {
3105 q += 4;
3106 bo = 1;
3107 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003108#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003109 if (bom == 0x0000FEFF) {
3110 q += 4;
3111 bo = 1;
3112 }
3113 else if (bom == 0xFFFE0000) {
3114 q += 4;
3115 bo = -1;
3116 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003117#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003118 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003119 }
3120
3121 if (bo == -1) {
3122 /* force LE */
3123 iorder[0] = 0;
3124 iorder[1] = 1;
3125 iorder[2] = 2;
3126 iorder[3] = 3;
3127 }
3128 else if (bo == 1) {
3129 /* force BE */
3130 iorder[0] = 3;
3131 iorder[1] = 2;
3132 iorder[2] = 1;
3133 iorder[3] = 0;
3134 }
3135
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003136 /* On narrow builds we split characters outside the BMP into two
3137 codepoints => count how much extra space we need. */
3138#ifndef Py_UNICODE_WIDE
3139 for (qq = q; qq < e; qq += 4)
3140 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3141 pairs++;
3142#endif
3143
3144 /* This might be one to much, because of a BOM */
3145 unicode = _PyUnicode_New((size+3)/4+pairs);
3146 if (!unicode)
3147 return NULL;
3148 if (size == 0)
3149 return (PyObject *)unicode;
3150
3151 /* Unpack UTF-32 encoded data */
3152 p = unicode->str;
3153
Walter Dörwald41980ca2007-08-16 21:55:45 +00003154 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 Py_UCS4 ch;
3156 /* remaining bytes at the end? (size should be divisible by 4) */
3157 if (e-q<4) {
3158 if (consumed)
3159 break;
3160 errmsg = "truncated data";
3161 startinpos = ((const char *)q)-starts;
3162 endinpos = ((const char *)e)-starts;
3163 goto utf32Error;
3164 /* The remaining input chars are ignored if the callback
3165 chooses to skip the input */
3166 }
3167 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3168 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003169
Benjamin Peterson29060642009-01-31 22:14:21 +00003170 if (ch >= 0x110000)
3171 {
3172 errmsg = "codepoint not in range(0x110000)";
3173 startinpos = ((const char *)q)-starts;
3174 endinpos = startinpos+4;
3175 goto utf32Error;
3176 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003177#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003178 if (ch >= 0x10000)
3179 {
3180 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3181 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3182 }
3183 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003184#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003185 *p++ = ch;
3186 q += 4;
3187 continue;
3188 utf32Error:
3189 outpos = p-PyUnicode_AS_UNICODE(unicode);
3190 if (unicode_decode_call_errorhandler(
3191 errors, &errorHandler,
3192 "utf32", errmsg,
3193 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3194 &unicode, &outpos, &p))
3195 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003196 }
3197
3198 if (byteorder)
3199 *byteorder = bo;
3200
3201 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003202 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003203
3204 /* Adjust length */
3205 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3206 goto onError;
3207
3208 Py_XDECREF(errorHandler);
3209 Py_XDECREF(exc);
3210 return (PyObject *)unicode;
3211
Benjamin Peterson29060642009-01-31 22:14:21 +00003212 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003213 Py_DECREF(unicode);
3214 Py_XDECREF(errorHandler);
3215 Py_XDECREF(exc);
3216 return NULL;
3217}
3218
3219PyObject *
3220PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003221 Py_ssize_t size,
3222 const char *errors,
3223 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003224{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003225 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003226 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003227 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003228#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003229 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003230#else
3231 const int pairs = 0;
3232#endif
3233 /* Offsets from p for storing byte pairs in the right order. */
3234#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3235 int iorder[] = {0, 1, 2, 3};
3236#else
3237 int iorder[] = {3, 2, 1, 0};
3238#endif
3239
Benjamin Peterson29060642009-01-31 22:14:21 +00003240#define STORECHAR(CH) \
3241 do { \
3242 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3243 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3244 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3245 p[iorder[0]] = (CH) & 0xff; \
3246 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003247 } while(0)
3248
3249 /* In narrow builds we can output surrogate pairs as one codepoint,
3250 so we need less space. */
3251#ifndef Py_UNICODE_WIDE
3252 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003253 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3254 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3255 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003256#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003257 nsize = (size - pairs + (byteorder == 0));
3258 bytesize = nsize * 4;
3259 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003261 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003262 if (v == NULL)
3263 return NULL;
3264
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003265 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003266 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003267 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003268 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003269 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003270
3271 if (byteorder == -1) {
3272 /* force LE */
3273 iorder[0] = 0;
3274 iorder[1] = 1;
3275 iorder[2] = 2;
3276 iorder[3] = 3;
3277 }
3278 else if (byteorder == 1) {
3279 /* force BE */
3280 iorder[0] = 3;
3281 iorder[1] = 2;
3282 iorder[2] = 1;
3283 iorder[3] = 0;
3284 }
3285
3286 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003287 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003288#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003289 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3290 Py_UCS4 ch2 = *s;
3291 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3292 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3293 s++;
3294 size--;
3295 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003296 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003297#endif
3298 STORECHAR(ch);
3299 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003300
3301 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003302 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003303#undef STORECHAR
3304}
3305
3306PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3307{
3308 if (!PyUnicode_Check(unicode)) {
3309 PyErr_BadArgument();
3310 return NULL;
3311 }
3312 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 PyUnicode_GET_SIZE(unicode),
3314 NULL,
3315 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003316}
3317
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318/* --- UTF-16 Codec ------------------------------------------------------- */
3319
Tim Peters772747b2001-08-09 22:21:55 +00003320PyObject *
3321PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003322 Py_ssize_t size,
3323 const char *errors,
3324 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003325{
Walter Dörwald69652032004-09-07 20:24:22 +00003326 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3327}
3328
Antoine Pitrouab868312009-01-10 15:40:25 +00003329/* Two masks for fast checking of whether a C 'long' may contain
3330 UTF16-encoded surrogate characters. This is an efficient heuristic,
3331 assuming that non-surrogate characters with a code point >= 0x8000 are
3332 rare in most input.
3333 FAST_CHAR_MASK is used when the input is in native byte ordering,
3334 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003335*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003336#if (SIZEOF_LONG == 8)
3337# define FAST_CHAR_MASK 0x8000800080008000L
3338# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3339#elif (SIZEOF_LONG == 4)
3340# define FAST_CHAR_MASK 0x80008000L
3341# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3342#else
3343# error C 'long' size should be either 4 or 8!
3344#endif
3345
Walter Dörwald69652032004-09-07 20:24:22 +00003346PyObject *
3347PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003348 Py_ssize_t size,
3349 const char *errors,
3350 int *byteorder,
3351 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003352{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003353 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003354 Py_ssize_t startinpos;
3355 Py_ssize_t endinpos;
3356 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 PyUnicodeObject *unicode;
3358 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003359 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003360 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003361 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003362 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003363 /* Offsets from q for retrieving byte pairs in the right order. */
3364#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3365 int ihi = 1, ilo = 0;
3366#else
3367 int ihi = 0, ilo = 1;
3368#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003369 PyObject *errorHandler = NULL;
3370 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371
3372 /* Note: size will always be longer than the resulting Unicode
3373 character count */
3374 unicode = _PyUnicode_New(size);
3375 if (!unicode)
3376 return NULL;
3377 if (size == 0)
3378 return (PyObject *)unicode;
3379
3380 /* Unpack UTF-16 encoded data */
3381 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003382 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003383 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384
3385 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003386 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003387
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003388 /* Check for BOM marks (U+FEFF) in the input and adjust current
3389 byte order setting accordingly. In native mode, the leading BOM
3390 mark is skipped, in all other modes, it is copied to the output
3391 stream as-is (giving a ZWNBSP character). */
3392 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003393 if (size >= 2) {
3394 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003395#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 if (bom == 0xFEFF) {
3397 q += 2;
3398 bo = -1;
3399 }
3400 else if (bom == 0xFFFE) {
3401 q += 2;
3402 bo = 1;
3403 }
Tim Petersced69f82003-09-16 20:30:58 +00003404#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003405 if (bom == 0xFEFF) {
3406 q += 2;
3407 bo = 1;
3408 }
3409 else if (bom == 0xFFFE) {
3410 q += 2;
3411 bo = -1;
3412 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003413#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003414 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416
Tim Peters772747b2001-08-09 22:21:55 +00003417 if (bo == -1) {
3418 /* force LE */
3419 ihi = 1;
3420 ilo = 0;
3421 }
3422 else if (bo == 1) {
3423 /* force BE */
3424 ihi = 0;
3425 ilo = 1;
3426 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003427#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3428 native_ordering = ilo < ihi;
3429#else
3430 native_ordering = ilo > ihi;
3431#endif
Tim Peters772747b2001-08-09 22:21:55 +00003432
Antoine Pitrouab868312009-01-10 15:40:25 +00003433 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003434 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003436 /* First check for possible aligned read of a C 'long'. Unaligned
3437 reads are more expensive, better to defer to another iteration. */
3438 if (!((size_t) q & LONG_PTR_MASK)) {
3439 /* Fast path for runs of non-surrogate chars. */
3440 register const unsigned char *_q = q;
3441 Py_UNICODE *_p = p;
3442 if (native_ordering) {
3443 /* Native ordering is simple: as long as the input cannot
3444 possibly contain a surrogate char, do an unrolled copy
3445 of several 16-bit code points to the target object.
3446 The non-surrogate check is done on several input bytes
3447 at a time (as many as a C 'long' can contain). */
3448 while (_q < aligned_end) {
3449 unsigned long data = * (unsigned long *) _q;
3450 if (data & FAST_CHAR_MASK)
3451 break;
3452 _p[0] = ((unsigned short *) _q)[0];
3453 _p[1] = ((unsigned short *) _q)[1];
3454#if (SIZEOF_LONG == 8)
3455 _p[2] = ((unsigned short *) _q)[2];
3456 _p[3] = ((unsigned short *) _q)[3];
3457#endif
3458 _q += SIZEOF_LONG;
3459 _p += SIZEOF_LONG / 2;
3460 }
3461 }
3462 else {
3463 /* Byteswapped ordering is similar, but we must decompose
3464 the copy bytewise, and take care of zero'ing out the
3465 upper bytes if the target object is in 32-bit units
3466 (that is, in UCS-4 builds). */
3467 while (_q < aligned_end) {
3468 unsigned long data = * (unsigned long *) _q;
3469 if (data & SWAPPED_FAST_CHAR_MASK)
3470 break;
3471 /* Zero upper bytes in UCS-4 builds */
3472#if (Py_UNICODE_SIZE > 2)
3473 _p[0] = 0;
3474 _p[1] = 0;
3475#if (SIZEOF_LONG == 8)
3476 _p[2] = 0;
3477 _p[3] = 0;
3478#endif
3479#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003480 /* Issue #4916; UCS-4 builds on big endian machines must
3481 fill the two last bytes of each 4-byte unit. */
3482#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3483# define OFF 2
3484#else
3485# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003486#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003487 ((unsigned char *) _p)[OFF + 1] = _q[0];
3488 ((unsigned char *) _p)[OFF + 0] = _q[1];
3489 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3490 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3491#if (SIZEOF_LONG == 8)
3492 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3493 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3494 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3495 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3496#endif
3497#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003498 _q += SIZEOF_LONG;
3499 _p += SIZEOF_LONG / 2;
3500 }
3501 }
3502 p = _p;
3503 q = _q;
3504 if (q >= e)
3505 break;
3506 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003507 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003508
Benjamin Peterson14339b62009-01-31 16:36:08 +00003509 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003510
3511 if (ch < 0xD800 || ch > 0xDFFF) {
3512 *p++ = ch;
3513 continue;
3514 }
3515
3516 /* UTF-16 code pair: */
3517 if (q > e) {
3518 errmsg = "unexpected end of data";
3519 startinpos = (((const char *)q) - 2) - starts;
3520 endinpos = ((const char *)e) + 1 - starts;
3521 goto utf16Error;
3522 }
3523 if (0xD800 <= ch && ch <= 0xDBFF) {
3524 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3525 q += 2;
3526 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003527#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003528 *p++ = ch;
3529 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003530#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003531 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003532#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003533 continue;
3534 }
3535 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003536 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 startinpos = (((const char *)q)-4)-starts;
3538 endinpos = startinpos+2;
3539 goto utf16Error;
3540 }
3541
Benjamin Peterson14339b62009-01-31 16:36:08 +00003542 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003543 errmsg = "illegal encoding";
3544 startinpos = (((const char *)q)-2)-starts;
3545 endinpos = startinpos+2;
3546 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003547
Benjamin Peterson29060642009-01-31 22:14:21 +00003548 utf16Error:
3549 outpos = p - PyUnicode_AS_UNICODE(unicode);
3550 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003551 errors,
3552 &errorHandler,
3553 "utf16", errmsg,
3554 &starts,
3555 (const char **)&e,
3556 &startinpos,
3557 &endinpos,
3558 &exc,
3559 (const char **)&q,
3560 &unicode,
3561 &outpos,
3562 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003563 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003565 /* remaining byte at the end? (size should be even) */
3566 if (e == q) {
3567 if (!consumed) {
3568 errmsg = "truncated data";
3569 startinpos = ((const char *)q) - starts;
3570 endinpos = ((const char *)e) + 1 - starts;
3571 outpos = p - PyUnicode_AS_UNICODE(unicode);
3572 if (unicode_decode_call_errorhandler(
3573 errors,
3574 &errorHandler,
3575 "utf16", errmsg,
3576 &starts,
3577 (const char **)&e,
3578 &startinpos,
3579 &endinpos,
3580 &exc,
3581 (const char **)&q,
3582 &unicode,
3583 &outpos,
3584 &p))
3585 goto onError;
3586 /* The remaining input chars are ignored if the callback
3587 chooses to skip the input */
3588 }
3589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590
3591 if (byteorder)
3592 *byteorder = bo;
3593
Walter Dörwald69652032004-09-07 20:24:22 +00003594 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003595 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003596
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003598 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599 goto onError;
3600
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 Py_XDECREF(errorHandler);
3602 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 return (PyObject *)unicode;
3604
Benjamin Peterson29060642009-01-31 22:14:21 +00003605 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003607 Py_XDECREF(errorHandler);
3608 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609 return NULL;
3610}
3611
Antoine Pitrouab868312009-01-10 15:40:25 +00003612#undef FAST_CHAR_MASK
3613#undef SWAPPED_FAST_CHAR_MASK
3614
Tim Peters772747b2001-08-09 22:21:55 +00003615PyObject *
3616PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 Py_ssize_t size,
3618 const char *errors,
3619 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003621 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003622 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003623 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003624#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003625 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003626#else
3627 const int pairs = 0;
3628#endif
Tim Peters772747b2001-08-09 22:21:55 +00003629 /* Offsets from p for storing byte pairs in the right order. */
3630#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3631 int ihi = 1, ilo = 0;
3632#else
3633 int ihi = 0, ilo = 1;
3634#endif
3635
Benjamin Peterson29060642009-01-31 22:14:21 +00003636#define STORECHAR(CH) \
3637 do { \
3638 p[ihi] = ((CH) >> 8) & 0xff; \
3639 p[ilo] = (CH) & 0xff; \
3640 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003641 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003643#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003644 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 if (s[i] >= 0x10000)
3646 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003647#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003648 /* 2 * (size + pairs + (byteorder == 0)) */
3649 if (size > PY_SSIZE_T_MAX ||
3650 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003651 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003652 nsize = size + pairs + (byteorder == 0);
3653 bytesize = nsize * 2;
3654 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003656 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 if (v == NULL)
3658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003660 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003663 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003664 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003665
3666 if (byteorder == -1) {
3667 /* force LE */
3668 ihi = 1;
3669 ilo = 0;
3670 }
3671 else if (byteorder == 1) {
3672 /* force BE */
3673 ihi = 0;
3674 ilo = 1;
3675 }
3676
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003677 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003678 Py_UNICODE ch = *s++;
3679 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003680#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 if (ch >= 0x10000) {
3682 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3683 ch = 0xD800 | ((ch-0x10000) >> 10);
3684 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003685#endif
Tim Peters772747b2001-08-09 22:21:55 +00003686 STORECHAR(ch);
3687 if (ch2)
3688 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003689 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003690
3691 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003692 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003693#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694}
3695
3696PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3697{
3698 if (!PyUnicode_Check(unicode)) {
3699 PyErr_BadArgument();
3700 return NULL;
3701 }
3702 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 PyUnicode_GET_SIZE(unicode),
3704 NULL,
3705 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706}
3707
3708/* --- Unicode Escape Codec ----------------------------------------------- */
3709
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): where it is assigned and used lies outside this excerpt --
   presumably by the unicode-escape decoder that follows; confirm. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00003710static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003711
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003713 Py_ssize_t size,
3714 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003717 Py_ssize_t startinpos;
3718 Py_ssize_t endinpos;
3719 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003720 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003724 char* message;
3725 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003726 PyObject *errorHandler = NULL;
3727 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003728
Guido van Rossumd57fd912000-03-10 22:53:23 +00003729 /* Escaped strings will always be longer than the resulting
3730 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003731 length after conversion to the true value.
3732 (but if the error callback returns a long replacement string
3733 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003734 v = _PyUnicode_New(size);
3735 if (v == NULL)
3736 goto onError;
3737 if (size == 0)
3738 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003739
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003741 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003742
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 while (s < end) {
3744 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003745 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003746 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747
3748 /* Non-escape characters are interpreted as Unicode ordinals */
3749 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003750 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 continue;
3752 }
3753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 /* \ - Escapes */
3756 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003757 c = *s++;
3758 if (s > end)
3759 c = '\0'; /* Invalid after \ */
3760 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761
Benjamin Peterson29060642009-01-31 22:14:21 +00003762 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 case '\n': break;
3764 case '\\': *p++ = '\\'; break;
3765 case '\'': *p++ = '\''; break;
3766 case '\"': *p++ = '\"'; break;
3767 case 'b': *p++ = '\b'; break;
3768 case 'f': *p++ = '\014'; break; /* FF */
3769 case 't': *p++ = '\t'; break;
3770 case 'n': *p++ = '\n'; break;
3771 case 'r': *p++ = '\r'; break;
3772 case 'v': *p++ = '\013'; break; /* VT */
3773 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3774
Benjamin Peterson29060642009-01-31 22:14:21 +00003775 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 case '0': case '1': case '2': case '3':
3777 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003778 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003779 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003780 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003781 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003782 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003784 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 break;
3786
Benjamin Peterson29060642009-01-31 22:14:21 +00003787 /* hex escapes */
3788 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003790 digits = 2;
3791 message = "truncated \\xXX escape";
3792 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793
Benjamin Peterson29060642009-01-31 22:14:21 +00003794 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003796 digits = 4;
3797 message = "truncated \\uXXXX escape";
3798 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799
Benjamin Peterson29060642009-01-31 22:14:21 +00003800 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003801 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003802 digits = 8;
3803 message = "truncated \\UXXXXXXXX escape";
3804 hexescape:
3805 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 outpos = p-PyUnicode_AS_UNICODE(v);
3807 if (s+digits>end) {
3808 endinpos = size;
3809 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003810 errors, &errorHandler,
3811 "unicodeescape", "end of string in escape sequence",
3812 &starts, &end, &startinpos, &endinpos, &exc, &s,
3813 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814 goto onError;
3815 goto nextByte;
3816 }
3817 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003818 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003819 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003820 endinpos = (s+i+1)-starts;
3821 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003822 errors, &errorHandler,
3823 "unicodeescape", message,
3824 &starts, &end, &startinpos, &endinpos, &exc, &s,
3825 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003826 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003827 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003828 }
3829 chr = (chr<<4) & ~0xF;
3830 if (c >= '0' && c <= '9')
3831 chr += c - '0';
3832 else if (c >= 'a' && c <= 'f')
3833 chr += 10 + c - 'a';
3834 else
3835 chr += 10 + c - 'A';
3836 }
3837 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003838 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003839 /* _decoding_error will have already written into the
3840 target buffer. */
3841 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003842 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003843 /* when we get here, chr is a 32-bit unicode character */
3844 if (chr <= 0xffff)
3845 /* UCS-2 character */
3846 *p++ = (Py_UNICODE) chr;
3847 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003848 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003849 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003850#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003851 *p++ = chr;
3852#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003853 chr -= 0x10000L;
3854 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003855 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003856#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003857 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003858 endinpos = s-starts;
3859 outpos = p-PyUnicode_AS_UNICODE(v);
3860 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003861 errors, &errorHandler,
3862 "unicodeescape", "illegal Unicode character",
3863 &starts, &end, &startinpos, &endinpos, &exc, &s,
3864 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003865 goto onError;
3866 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003867 break;
3868
Benjamin Peterson29060642009-01-31 22:14:21 +00003869 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003870 case 'N':
3871 message = "malformed \\N character escape";
3872 if (ucnhash_CAPI == NULL) {
3873 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003874 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003875 if (ucnhash_CAPI == NULL)
3876 goto ucnhashError;
3877 }
3878 if (*s == '{') {
3879 const char *start = s+1;
3880 /* look for the closing brace */
3881 while (*s != '}' && s < end)
3882 s++;
3883 if (s > start && s < end && *s == '}') {
3884 /* found a name. look it up in the unicode database */
3885 message = "unknown Unicode character name";
3886 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003887 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003888 goto store;
3889 }
3890 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003891 endinpos = s-starts;
3892 outpos = p-PyUnicode_AS_UNICODE(v);
3893 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 errors, &errorHandler,
3895 "unicodeescape", message,
3896 &starts, &end, &startinpos, &endinpos, &exc, &s,
3897 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003898 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003899 break;
3900
3901 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003902 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003903 message = "\\ at end of string";
3904 s--;
3905 endinpos = s-starts;
3906 outpos = p-PyUnicode_AS_UNICODE(v);
3907 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003908 errors, &errorHandler,
3909 "unicodeescape", message,
3910 &starts, &end, &startinpos, &endinpos, &exc, &s,
3911 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003912 goto onError;
3913 }
3914 else {
3915 *p++ = '\\';
3916 *p++ = (unsigned char)s[-1];
3917 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003918 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003920 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003923 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003925 Py_XDECREF(errorHandler);
3926 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003928
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003930 PyErr_SetString(
3931 PyExc_UnicodeError,
3932 "\\N escapes not supported (can't load unicodedata module)"
3933 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003934 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003935 Py_XDECREF(errorHandler);
3936 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003937 return NULL;
3938
Benjamin Peterson29060642009-01-31 22:14:21 +00003939 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003941 Py_XDECREF(errorHandler);
3942 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 return NULL;
3944}
3945
3946/* Return a Unicode-Escape string version of the Unicode object.
3947
3948 If quotes is true, the string is enclosed in u"" or u'' quotes as
3949 appropriate.
3950
3951*/
3952
Thomas Wouters477c8d52006-05-27 19:21:47 +00003953Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003954 Py_ssize_t size,
3955 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003956{
3957 /* like wcschr, but doesn't stop at NULL characters */
3958
3959 while (size-- > 0) {
3960 if (*s == ch)
3961 return s;
3962 s++;
3963 }
3964
3965 return NULL;
3966}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003967
/* Lowercase hexadecimal digits indexed by nibble value (0..15).  The escape
   encoders below index this table with (value >> shift) & 0xF to emit the
   digits of the \xhh, \uxxxx and \Uxxxxxxxx escapes. */
static const char *hexdigits = "0123456789abcdef";
3969
3970PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003973 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003976#ifdef Py_UNICODE_WIDE
3977 const Py_ssize_t expandsize = 10;
3978#else
3979 const Py_ssize_t expandsize = 6;
3980#endif
3981
Thomas Wouters89f507f2006-12-13 04:49:30 +00003982 /* XXX(nnorwitz): rather than over-allocating, it would be
3983 better to choose a different scheme. Perhaps scan the
3984 first N-chars of the string and allocate based on that size.
3985 */
3986 /* Initial allocation is based on the longest-possible unichr
3987 escape.
3988
3989 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3990 unichr, so in this case it's the longest unichr escape. In
3991 narrow (UTF-16) builds this is five chars per source unichr
3992 since there are two unichrs in the surrogate pair, so in narrow
3993 (UTF-16) builds it's not the longest unichr escape.
3994
3995 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3996 so in the narrow (UTF-16) build case it's the longest unichr
3997 escape.
3998 */
3999
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004000 if (size == 0)
4001 return PyBytes_FromStringAndSize(NULL, 0);
4002
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004003 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004005
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004006 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004007 2
4008 + expandsize*size
4009 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 if (repr == NULL)
4011 return NULL;
4012
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004013 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 while (size-- > 0) {
4016 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004017
Walter Dörwald79e913e2007-05-12 11:08:06 +00004018 /* Escape backslashes */
4019 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004020 *p++ = '\\';
4021 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004022 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004023 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004024
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004025#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004026 /* Map 21-bit characters to '\U00xxxxxx' */
4027 else if (ch >= 0x10000) {
4028 *p++ = '\\';
4029 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004030 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4031 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4032 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4033 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4034 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4035 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4036 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4037 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004038 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004039 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004040#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004041 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4042 else if (ch >= 0xD800 && ch < 0xDC00) {
4043 Py_UNICODE ch2;
4044 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004045
Benjamin Peterson29060642009-01-31 22:14:21 +00004046 ch2 = *s++;
4047 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004048 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004049 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4050 *p++ = '\\';
4051 *p++ = 'U';
4052 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4053 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4054 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4055 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4056 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4057 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4058 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4059 *p++ = hexdigits[ucs & 0x0000000F];
4060 continue;
4061 }
4062 /* Fall through: isolated surrogates are copied as-is */
4063 s--;
4064 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004065 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004066#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004067
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004069 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 *p++ = '\\';
4071 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004072 *p++ = hexdigits[(ch >> 12) & 0x000F];
4073 *p++ = hexdigits[(ch >> 8) & 0x000F];
4074 *p++ = hexdigits[(ch >> 4) & 0x000F];
4075 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004077
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004078 /* Map special whitespace to '\t', \n', '\r' */
4079 else if (ch == '\t') {
4080 *p++ = '\\';
4081 *p++ = 't';
4082 }
4083 else if (ch == '\n') {
4084 *p++ = '\\';
4085 *p++ = 'n';
4086 }
4087 else if (ch == '\r') {
4088 *p++ = '\\';
4089 *p++ = 'r';
4090 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004091
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004092 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004093 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004095 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004096 *p++ = hexdigits[(ch >> 4) & 0x000F];
4097 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004098 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004099
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100 /* Copy everything else as-is */
4101 else
4102 *p++ = (char) ch;
4103 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004105 assert(p - PyBytes_AS_STRING(repr) > 0);
4106 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4107 return NULL;
4108 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109}
4110
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004111PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004113 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114 if (!PyUnicode_Check(unicode)) {
4115 PyErr_BadArgument();
4116 return NULL;
4117 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004118 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4119 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004120 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121}
4122
4123/* --- Raw Unicode Escape Codec ------------------------------------------- */
4124
4125PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 Py_ssize_t size,
4127 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004130 Py_ssize_t startinpos;
4131 Py_ssize_t endinpos;
4132 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004134 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 const char *end;
4136 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004137 PyObject *errorHandler = NULL;
4138 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004139
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 /* Escaped strings will always be longer than the resulting
4141 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 length after conversion to the true value. (But decoding error
4143 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144 v = _PyUnicode_New(size);
4145 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004146 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150 end = s + size;
4151 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 unsigned char c;
4153 Py_UCS4 x;
4154 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004155 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156
Benjamin Peterson29060642009-01-31 22:14:21 +00004157 /* Non-escape characters are interpreted as Unicode ordinals */
4158 if (*s != '\\') {
4159 *p++ = (unsigned char)*s++;
4160 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004161 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 startinpos = s-starts;
4163
4164 /* \u-escapes are only interpreted iff the number of leading
4165 backslashes if odd */
4166 bs = s;
4167 for (;s < end;) {
4168 if (*s != '\\')
4169 break;
4170 *p++ = (unsigned char)*s++;
4171 }
4172 if (((s - bs) & 1) == 0 ||
4173 s >= end ||
4174 (*s != 'u' && *s != 'U')) {
4175 continue;
4176 }
4177 p--;
4178 count = *s=='u' ? 4 : 8;
4179 s++;
4180
4181 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4182 outpos = p-PyUnicode_AS_UNICODE(v);
4183 for (x = 0, i = 0; i < count; ++i, ++s) {
4184 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004185 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 endinpos = s-starts;
4187 if (unicode_decode_call_errorhandler(
4188 errors, &errorHandler,
4189 "rawunicodeescape", "truncated \\uXXXX",
4190 &starts, &end, &startinpos, &endinpos, &exc, &s,
4191 &v, &outpos, &p))
4192 goto onError;
4193 goto nextByte;
4194 }
4195 x = (x<<4) & ~0xF;
4196 if (c >= '0' && c <= '9')
4197 x += c - '0';
4198 else if (c >= 'a' && c <= 'f')
4199 x += 10 + c - 'a';
4200 else
4201 x += 10 + c - 'A';
4202 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004203 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004204 /* UCS-2 character */
4205 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004206 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 /* UCS-4 character. Either store directly, or as
4208 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004209#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004211#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 x -= 0x10000L;
4213 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4214 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004215#endif
4216 } else {
4217 endinpos = s-starts;
4218 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004219 if (unicode_decode_call_errorhandler(
4220 errors, &errorHandler,
4221 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004222 &starts, &end, &startinpos, &endinpos, &exc, &s,
4223 &v, &outpos, &p))
4224 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004225 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 nextByte:
4227 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004229 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004230 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004231 Py_XDECREF(errorHandler);
4232 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004233 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004234
Benjamin Peterson29060642009-01-31 22:14:21 +00004235 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237 Py_XDECREF(errorHandler);
4238 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239 return NULL;
4240}
4241
4242PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004245 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246 char *p;
4247 char *q;
4248
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004249#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004250 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004251#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004252 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004253#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004254
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004255 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004257
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004258 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 if (repr == NULL)
4260 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004261 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004262 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004264 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265 while (size-- > 0) {
4266 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004267#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 /* Map 32-bit characters to '\Uxxxxxxxx' */
4269 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004270 *p++ = '\\';
4271 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004272 *p++ = hexdigits[(ch >> 28) & 0xf];
4273 *p++ = hexdigits[(ch >> 24) & 0xf];
4274 *p++ = hexdigits[(ch >> 20) & 0xf];
4275 *p++ = hexdigits[(ch >> 16) & 0xf];
4276 *p++ = hexdigits[(ch >> 12) & 0xf];
4277 *p++ = hexdigits[(ch >> 8) & 0xf];
4278 *p++ = hexdigits[(ch >> 4) & 0xf];
4279 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004280 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004281 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004282#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4284 if (ch >= 0xD800 && ch < 0xDC00) {
4285 Py_UNICODE ch2;
4286 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004287
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 ch2 = *s++;
4289 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004290 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4292 *p++ = '\\';
4293 *p++ = 'U';
4294 *p++ = hexdigits[(ucs >> 28) & 0xf];
4295 *p++ = hexdigits[(ucs >> 24) & 0xf];
4296 *p++ = hexdigits[(ucs >> 20) & 0xf];
4297 *p++ = hexdigits[(ucs >> 16) & 0xf];
4298 *p++ = hexdigits[(ucs >> 12) & 0xf];
4299 *p++ = hexdigits[(ucs >> 8) & 0xf];
4300 *p++ = hexdigits[(ucs >> 4) & 0xf];
4301 *p++ = hexdigits[ucs & 0xf];
4302 continue;
4303 }
4304 /* Fall through: isolated surrogates are copied as-is */
4305 s--;
4306 size++;
4307 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004308#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 /* Map 16-bit characters to '\uxxxx' */
4310 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 *p++ = '\\';
4312 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004313 *p++ = hexdigits[(ch >> 12) & 0xf];
4314 *p++ = hexdigits[(ch >> 8) & 0xf];
4315 *p++ = hexdigits[(ch >> 4) & 0xf];
4316 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 /* Copy everything else as-is */
4319 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 *p++ = (char) ch;
4321 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004322 size = p - q;
4323
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004324 assert(size > 0);
4325 if (_PyBytes_Resize(&repr, size) < 0)
4326 return NULL;
4327 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328}
4329
4330PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4331{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004332 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004334 PyErr_BadArgument();
4335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004337 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4338 PyUnicode_GET_SIZE(unicode));
4339
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004340 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341}
4342
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004343/* --- Unicode Internal Codec ------------------------------------------- */
4344
4345PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004346 Py_ssize_t size,
4347 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004348{
4349 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004350 Py_ssize_t startinpos;
4351 Py_ssize_t endinpos;
4352 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004353 PyUnicodeObject *v;
4354 Py_UNICODE *p;
4355 const char *end;
4356 const char *reason;
4357 PyObject *errorHandler = NULL;
4358 PyObject *exc = NULL;
4359
Neal Norwitzd43069c2006-01-08 01:12:10 +00004360#ifdef Py_UNICODE_WIDE
4361 Py_UNICODE unimax = PyUnicode_GetMax();
4362#endif
4363
Thomas Wouters89f507f2006-12-13 04:49:30 +00004364 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004365 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4366 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004368 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004370 p = PyUnicode_AS_UNICODE(v);
4371 end = s + size;
4372
4373 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004374 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004375 /* We have to sanity check the raw data, otherwise doom looms for
4376 some malformed UCS-4 data. */
4377 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004378#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004379 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004380#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004381 end-s < Py_UNICODE_SIZE
4382 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004384 startinpos = s - starts;
4385 if (end-s < Py_UNICODE_SIZE) {
4386 endinpos = end-starts;
4387 reason = "truncated input";
4388 }
4389 else {
4390 endinpos = s - starts + Py_UNICODE_SIZE;
4391 reason = "illegal code point (> 0x10FFFF)";
4392 }
4393 outpos = p - PyUnicode_AS_UNICODE(v);
4394 if (unicode_decode_call_errorhandler(
4395 errors, &errorHandler,
4396 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004397 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004398 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004399 goto onError;
4400 }
4401 }
4402 else {
4403 p++;
4404 s += Py_UNICODE_SIZE;
4405 }
4406 }
4407
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004408 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004409 goto onError;
4410 Py_XDECREF(errorHandler);
4411 Py_XDECREF(exc);
4412 return (PyObject *)v;
4413
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004415 Py_XDECREF(v);
4416 Py_XDECREF(errorHandler);
4417 Py_XDECREF(exc);
4418 return NULL;
4419}
4420
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421/* --- Latin-1 Codec ------------------------------------------------------ */
4422
4423PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004424 Py_ssize_t size,
4425 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426{
4427 PyUnicodeObject *v;
4428 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004429 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004430
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004432 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 Py_UNICODE r = *(unsigned char*)s;
4434 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004435 }
4436
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437 v = _PyUnicode_New(size);
4438 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004439 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004441 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004443 e = s + size;
4444 /* Unrolling the copy makes it much faster by reducing the looping
4445 overhead. This is similar to what many memcpy() implementations do. */
4446 unrolled_end = e - 4;
4447 while (s < unrolled_end) {
4448 p[0] = (unsigned char) s[0];
4449 p[1] = (unsigned char) s[1];
4450 p[2] = (unsigned char) s[2];
4451 p[3] = (unsigned char) s[3];
4452 s += 4;
4453 p += 4;
4454 }
4455 while (s < e)
4456 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004458
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460 Py_XDECREF(v);
4461 return NULL;
4462}
4463
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004464/* create or adjust a UnicodeEncodeError */
4465static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004466 const char *encoding,
4467 const Py_UNICODE *unicode, Py_ssize_t size,
4468 Py_ssize_t startpos, Py_ssize_t endpos,
4469 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004471 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 *exceptionObject = PyUnicodeEncodeError_Create(
4473 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 }
4475 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4477 goto onError;
4478 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4479 goto onError;
4480 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4481 goto onError;
4482 return;
4483 onError:
4484 Py_DECREF(*exceptionObject);
4485 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 }
4487}
4488
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004489/* raises a UnicodeEncodeError */
4490static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 const char *encoding,
4492 const Py_UNICODE *unicode, Py_ssize_t size,
4493 Py_ssize_t startpos, Py_ssize_t endpos,
4494 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004495{
4496 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004498 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004499 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004500}
4501
4502/* error handling callback helper:
4503 build arguments, call the callback and check the arguments,
4504 put the result into newpos and return the replacement string, which
4505 has to be freed by the caller */
4506static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 PyObject **errorHandler,
4508 const char *encoding, const char *reason,
4509 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4510 Py_ssize_t startpos, Py_ssize_t endpos,
4511 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004512{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004513 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514
4515 PyObject *restuple;
4516 PyObject *resunicode;
4517
4518 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004520 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 }
4523
4524 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004525 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004526 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004528
4529 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004532 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004534 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 Py_DECREF(restuple);
4536 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004538 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004539 &resunicode, newpos)) {
4540 Py_DECREF(restuple);
4541 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004543 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4544 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4545 Py_DECREF(restuple);
4546 return NULL;
4547 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004548 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004550 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4552 Py_DECREF(restuple);
4553 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004554 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 Py_INCREF(resunicode);
4556 Py_DECREF(restuple);
4557 return resunicode;
4558}
4559
4560static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 Py_ssize_t size,
4562 const char *errors,
4563 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564{
4565 /* output object */
4566 PyObject *res;
4567 /* pointers to the beginning and end+1 of input */
4568 const Py_UNICODE *startp = p;
4569 const Py_UNICODE *endp = p + size;
4570 /* pointer to the beginning of the unencodable characters */
4571 /* const Py_UNICODE *badp = NULL; */
4572 /* pointer into the output */
4573 char *str;
4574 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004575 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004576 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4577 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578 PyObject *errorHandler = NULL;
4579 PyObject *exc = NULL;
4580 /* the following variable is used for caching string comparisons
4581 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4582 int known_errorHandler = -1;
4583
4584 /* allocate enough for a simple encoding without
4585 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004586 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004587 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004588 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004589 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004590 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004591 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 ressize = size;
4593
4594 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004596
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 /* can we encode this? */
4598 if (c<limit) {
4599 /* no overflow check, because we know that the space is enough */
4600 *str++ = (char)c;
4601 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004602 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004603 else {
4604 Py_ssize_t unicodepos = p-startp;
4605 Py_ssize_t requiredsize;
4606 PyObject *repunicode;
4607 Py_ssize_t repsize;
4608 Py_ssize_t newpos;
4609 Py_ssize_t respos;
4610 Py_UNICODE *uni2;
4611 /* startpos for collecting unencodable chars */
4612 const Py_UNICODE *collstart = p;
4613 const Py_UNICODE *collend = p;
4614 /* find all unecodable characters */
4615 while ((collend < endp) && ((*collend)>=limit))
4616 ++collend;
4617 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4618 if (known_errorHandler==-1) {
4619 if ((errors==NULL) || (!strcmp(errors, "strict")))
4620 known_errorHandler = 1;
4621 else if (!strcmp(errors, "replace"))
4622 known_errorHandler = 2;
4623 else if (!strcmp(errors, "ignore"))
4624 known_errorHandler = 3;
4625 else if (!strcmp(errors, "xmlcharrefreplace"))
4626 known_errorHandler = 4;
4627 else
4628 known_errorHandler = 0;
4629 }
4630 switch (known_errorHandler) {
4631 case 1: /* strict */
4632 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4633 goto onError;
4634 case 2: /* replace */
4635 while (collstart++<collend)
4636 *str++ = '?'; /* fall through */
4637 case 3: /* ignore */
4638 p = collend;
4639 break;
4640 case 4: /* xmlcharrefreplace */
4641 respos = str - PyBytes_AS_STRING(res);
4642 /* determine replacement size (temporarily (mis)uses p) */
4643 for (p = collstart, repsize = 0; p < collend; ++p) {
4644 if (*p<10)
4645 repsize += 2+1+1;
4646 else if (*p<100)
4647 repsize += 2+2+1;
4648 else if (*p<1000)
4649 repsize += 2+3+1;
4650 else if (*p<10000)
4651 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004652#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004653 else
4654 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004655#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004656 else if (*p<100000)
4657 repsize += 2+5+1;
4658 else if (*p<1000000)
4659 repsize += 2+6+1;
4660 else
4661 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004662#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004663 }
4664 requiredsize = respos+repsize+(endp-collend);
4665 if (requiredsize > ressize) {
4666 if (requiredsize<2*ressize)
4667 requiredsize = 2*ressize;
4668 if (_PyBytes_Resize(&res, requiredsize))
4669 goto onError;
4670 str = PyBytes_AS_STRING(res) + respos;
4671 ressize = requiredsize;
4672 }
4673 /* generate replacement (temporarily (mis)uses p) */
4674 for (p = collstart; p < collend; ++p) {
4675 str += sprintf(str, "&#%d;", (int)*p);
4676 }
4677 p = collend;
4678 break;
4679 default:
4680 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4681 encoding, reason, startp, size, &exc,
4682 collstart-startp, collend-startp, &newpos);
4683 if (repunicode == NULL)
4684 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004685 if (PyBytes_Check(repunicode)) {
4686 /* Directly copy bytes result to output. */
4687 repsize = PyBytes_Size(repunicode);
4688 if (repsize > 1) {
4689 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004690 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004691 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4692 Py_DECREF(repunicode);
4693 goto onError;
4694 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004695 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004696 ressize += repsize-1;
4697 }
4698 memcpy(str, PyBytes_AsString(repunicode), repsize);
4699 str += repsize;
4700 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004701 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004702 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004703 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004704 /* need more space? (at least enough for what we
4705 have+the replacement+the rest of the string, so
4706 we won't have to check space for encodable characters) */
4707 respos = str - PyBytes_AS_STRING(res);
4708 repsize = PyUnicode_GET_SIZE(repunicode);
4709 requiredsize = respos+repsize+(endp-collend);
4710 if (requiredsize > ressize) {
4711 if (requiredsize<2*ressize)
4712 requiredsize = 2*ressize;
4713 if (_PyBytes_Resize(&res, requiredsize)) {
4714 Py_DECREF(repunicode);
4715 goto onError;
4716 }
4717 str = PyBytes_AS_STRING(res) + respos;
4718 ressize = requiredsize;
4719 }
4720 /* check if there is anything unencodable in the replacement
4721 and copy it to the output */
4722 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4723 c = *uni2;
4724 if (c >= limit) {
4725 raise_encode_exception(&exc, encoding, startp, size,
4726 unicodepos, unicodepos+1, reason);
4727 Py_DECREF(repunicode);
4728 goto onError;
4729 }
4730 *str = (char)c;
4731 }
4732 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004733 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004734 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004735 }
4736 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004737 /* Resize if we allocated to much */
4738 size = str - PyBytes_AS_STRING(res);
4739 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004740 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004741 if (_PyBytes_Resize(&res, size) < 0)
4742 goto onError;
4743 }
4744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004745 Py_XDECREF(errorHandler);
4746 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004747 return res;
4748
4749 onError:
4750 Py_XDECREF(res);
4751 Py_XDECREF(errorHandler);
4752 Py_XDECREF(exc);
4753 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004754}
4755
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004757 Py_ssize_t size,
4758 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004760 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761}
4762
4763PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4764{
4765 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004766 PyErr_BadArgument();
4767 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 }
4769 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004770 PyUnicode_GET_SIZE(unicode),
4771 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772}
4773
4774/* --- 7-bit ASCII Codec -------------------------------------------------- */
4775
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004777 Py_ssize_t size,
4778 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004780 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781 PyUnicodeObject *v;
4782 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004783 Py_ssize_t startinpos;
4784 Py_ssize_t endinpos;
4785 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004786 const char *e;
4787 PyObject *errorHandler = NULL;
4788 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004789
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004791 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004792 Py_UNICODE r = *(unsigned char*)s;
4793 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004794 }
Tim Petersced69f82003-09-16 20:30:58 +00004795
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796 v = _PyUnicode_New(size);
4797 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004800 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004802 e = s + size;
4803 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004804 register unsigned char c = (unsigned char)*s;
4805 if (c < 128) {
4806 *p++ = c;
4807 ++s;
4808 }
4809 else {
4810 startinpos = s-starts;
4811 endinpos = startinpos + 1;
4812 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4813 if (unicode_decode_call_errorhandler(
4814 errors, &errorHandler,
4815 "ascii", "ordinal not in range(128)",
4816 &starts, &e, &startinpos, &endinpos, &exc, &s,
4817 &v, &outpos, &p))
4818 goto onError;
4819 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004821 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004822 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4823 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 Py_XDECREF(errorHandler);
4825 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004827
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004830 Py_XDECREF(errorHandler);
4831 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 return NULL;
4833}
4834
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 Py_ssize_t size,
4837 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004839 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840}
4841
4842PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4843{
4844 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004845 PyErr_BadArgument();
4846 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 }
4848 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 PyUnicode_GET_SIZE(unicode),
4850 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851}
4852
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004853#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004854
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004855/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004856
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004857#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004858#define NEED_RETRY
4859#endif
4860
/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */

/* Return non-zero if the byte at s[offset] is a DBCS lead byte according to
   IsDBCSLeadByte() and the character preceding it (as located by CharPrev())
   does not rule that out; return 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}
4876
4877/*
4878 * Decode MBCS string into unicode object. If 'final' is set, converts
4879 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4880 */
4881static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 const char *s, /* MBCS string */
4883 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004884 int final,
4885 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004886{
4887 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004888 Py_ssize_t n;
4889 DWORD usize;
4890 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004891
4892 assert(size >= 0);
4893
Victor Stinner554f3f02010-06-16 23:33:54 +00004894 /* check and handle 'errors' arg */
4895 if (errors==NULL || strcmp(errors, "strict")==0)
4896 flags = MB_ERR_INVALID_CHARS;
4897 else if (strcmp(errors, "ignore")==0)
4898 flags = 0;
4899 else {
4900 PyErr_Format(PyExc_ValueError,
4901 "mbcs encoding does not support errors='%s'",
4902 errors);
4903 return -1;
4904 }
4905
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004906 /* Skip trailing lead-byte unless 'final' is set */
4907 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004909
4910 /* First get the size of the result */
4911 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004912 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4913 if (usize==0)
4914 goto mbcs_decode_error;
4915 } else
4916 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004917
4918 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 /* Create unicode object */
4920 *v = _PyUnicode_New(usize);
4921 if (*v == NULL)
4922 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004923 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004924 }
4925 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 /* Extend unicode object */
4927 n = PyUnicode_GET_SIZE(*v);
4928 if (_PyUnicode_Resize(v, n + usize) < 0)
4929 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004930 }
4931
4932 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004933 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004935 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4936 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004938 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004939 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004940
4941mbcs_decode_error:
4942 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4943 we raise a UnicodeDecodeError - else it is a 'generic'
4944 windows error
4945 */
4946 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4947 /* Ideally, we should get reason from FormatMessage - this
4948 is the Windows 2000 English version of the message
4949 */
4950 PyObject *exc = NULL;
4951 const char *reason = "No mapping for the Unicode character exists "
4952 "in the target multi-byte code page.";
4953 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4954 if (exc != NULL) {
4955 PyCodec_StrictErrors(exc);
4956 Py_DECREF(exc);
4957 }
4958 } else {
4959 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4960 }
4961 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004962}
4963
4964PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004965 Py_ssize_t size,
4966 const char *errors,
4967 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004968{
4969 PyUnicodeObject *v = NULL;
4970 int done;
4971
4972 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004974
4975#ifdef NEED_RETRY
4976 retry:
4977 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004978 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004979 else
4980#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00004981 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004982
4983 if (done < 0) {
4984 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00004985 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004986 }
4987
4988 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004990
4991#ifdef NEED_RETRY
4992 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 s += done;
4994 size -= done;
4995 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004996 }
4997#endif
4998
4999 return (PyObject *)v;
5000}
5001
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005002PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 Py_ssize_t size,
5004 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005005{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005006 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5007}
5008
5009/*
5010 * Convert unicode into string object (MBCS).
5011 * Returns 0 if succeed, -1 otherwise.
5012 */
5013static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005015 int size, /* size of unicode */
5016 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005017{
Victor Stinner554f3f02010-06-16 23:33:54 +00005018 BOOL usedDefaultChar = FALSE;
5019 BOOL *pusedDefaultChar;
5020 int mbcssize;
5021 Py_ssize_t n;
5022 PyObject *exc = NULL;
5023 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005024
5025 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005026
Victor Stinner554f3f02010-06-16 23:33:54 +00005027 /* check and handle 'errors' arg */
5028 if (errors==NULL || strcmp(errors, "strict")==0) {
5029 flags = WC_NO_BEST_FIT_CHARS;
5030 pusedDefaultChar = &usedDefaultChar;
5031 } else if (strcmp(errors, "replace")==0) {
5032 flags = 0;
5033 pusedDefaultChar = NULL;
5034 } else {
5035 PyErr_Format(PyExc_ValueError,
5036 "mbcs encoding does not support errors='%s'",
5037 errors);
5038 return -1;
5039 }
5040
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005041 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005042 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005043 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5044 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 if (mbcssize == 0) {
5046 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5047 return -1;
5048 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005049 /* If we used a default char, then we failed! */
5050 if (pusedDefaultChar && *pusedDefaultChar)
5051 goto mbcs_encode_error;
5052 } else {
5053 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005054 }
5055
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005056 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 /* Create string object */
5058 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5059 if (*repr == NULL)
5060 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005061 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005062 }
5063 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 /* Extend string object */
5065 n = PyBytes_Size(*repr);
5066 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5067 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005068 }
5069
5070 /* Do the conversion */
5071 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005073 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5074 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5076 return -1;
5077 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005078 if (pusedDefaultChar && *pusedDefaultChar)
5079 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005080 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005081 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005082
5083mbcs_encode_error:
5084 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5085 Py_XDECREF(exc);
5086 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005087}
5088
5089PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 Py_ssize_t size,
5091 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005092{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005093 PyObject *repr = NULL;
5094 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005095
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005096#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005098 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005099 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005100 else
5101#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005102 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005103
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005104 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 Py_XDECREF(repr);
5106 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005107 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005108
5109#ifdef NEED_RETRY
5110 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 p += INT_MAX;
5112 size -= INT_MAX;
5113 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005114 }
5115#endif
5116
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005117 return repr;
5118}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005119
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005120PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5121{
5122 if (!PyUnicode_Check(unicode)) {
5123 PyErr_BadArgument();
5124 return NULL;
5125 }
5126 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 PyUnicode_GET_SIZE(unicode),
5128 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005129}
5130
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005131#undef NEED_RETRY
5132
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005133#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005134
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135/* --- Character Mapping Codec -------------------------------------------- */
5136
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005138 Py_ssize_t size,
5139 PyObject *mapping,
5140 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005142 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005143 Py_ssize_t startinpos;
5144 Py_ssize_t endinpos;
5145 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005146 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 PyUnicodeObject *v;
5148 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005149 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005150 PyObject *errorHandler = NULL;
5151 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005152 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005153 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005154
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 /* Default to Latin-1 */
5156 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158
5159 v = _PyUnicode_New(size);
5160 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005165 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005166 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 mapstring = PyUnicode_AS_UNICODE(mapping);
5168 maplen = PyUnicode_GET_SIZE(mapping);
5169 while (s < e) {
5170 unsigned char ch = *s;
5171 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005172
Benjamin Peterson29060642009-01-31 22:14:21 +00005173 if (ch < maplen)
5174 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175
Benjamin Peterson29060642009-01-31 22:14:21 +00005176 if (x == 0xfffe) {
5177 /* undefined mapping */
5178 outpos = p-PyUnicode_AS_UNICODE(v);
5179 startinpos = s-starts;
5180 endinpos = startinpos+1;
5181 if (unicode_decode_call_errorhandler(
5182 errors, &errorHandler,
5183 "charmap", "character maps to <undefined>",
5184 &starts, &e, &startinpos, &endinpos, &exc, &s,
5185 &v, &outpos, &p)) {
5186 goto onError;
5187 }
5188 continue;
5189 }
5190 *p++ = x;
5191 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005192 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005193 }
5194 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 while (s < e) {
5196 unsigned char ch = *s;
5197 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005198
Benjamin Peterson29060642009-01-31 22:14:21 +00005199 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5200 w = PyLong_FromLong((long)ch);
5201 if (w == NULL)
5202 goto onError;
5203 x = PyObject_GetItem(mapping, w);
5204 Py_DECREF(w);
5205 if (x == NULL) {
5206 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5207 /* No mapping found means: mapping is undefined. */
5208 PyErr_Clear();
5209 x = Py_None;
5210 Py_INCREF(x);
5211 } else
5212 goto onError;
5213 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005214
Benjamin Peterson29060642009-01-31 22:14:21 +00005215 /* Apply mapping */
5216 if (PyLong_Check(x)) {
5217 long value = PyLong_AS_LONG(x);
5218 if (value < 0 || value > 65535) {
5219 PyErr_SetString(PyExc_TypeError,
5220 "character mapping must be in range(65536)");
5221 Py_DECREF(x);
5222 goto onError;
5223 }
5224 *p++ = (Py_UNICODE)value;
5225 }
5226 else if (x == Py_None) {
5227 /* undefined mapping */
5228 outpos = p-PyUnicode_AS_UNICODE(v);
5229 startinpos = s-starts;
5230 endinpos = startinpos+1;
5231 if (unicode_decode_call_errorhandler(
5232 errors, &errorHandler,
5233 "charmap", "character maps to <undefined>",
5234 &starts, &e, &startinpos, &endinpos, &exc, &s,
5235 &v, &outpos, &p)) {
5236 Py_DECREF(x);
5237 goto onError;
5238 }
5239 Py_DECREF(x);
5240 continue;
5241 }
5242 else if (PyUnicode_Check(x)) {
5243 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005244
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 if (targetsize == 1)
5246 /* 1-1 mapping */
5247 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005248
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 else if (targetsize > 1) {
5250 /* 1-n mapping */
5251 if (targetsize > extrachars) {
5252 /* resize first */
5253 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5254 Py_ssize_t needed = (targetsize - extrachars) + \
5255 (targetsize << 2);
5256 extrachars += needed;
5257 /* XXX overflow detection missing */
5258 if (_PyUnicode_Resize(&v,
5259 PyUnicode_GET_SIZE(v) + needed) < 0) {
5260 Py_DECREF(x);
5261 goto onError;
5262 }
5263 p = PyUnicode_AS_UNICODE(v) + oldpos;
5264 }
5265 Py_UNICODE_COPY(p,
5266 PyUnicode_AS_UNICODE(x),
5267 targetsize);
5268 p += targetsize;
5269 extrachars -= targetsize;
5270 }
5271 /* 1-0 mapping: skip the character */
5272 }
5273 else {
5274 /* wrong return value */
5275 PyErr_SetString(PyExc_TypeError,
5276 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005277 Py_DECREF(x);
5278 goto onError;
5279 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005280 Py_DECREF(x);
5281 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005282 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005283 }
5284 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005285 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5286 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005287 Py_XDECREF(errorHandler);
5288 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005290
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005292 Py_XDECREF(errorHandler);
5293 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 Py_XDECREF(v);
5295 return NULL;
5296}
5297
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005298/* Charmap encoding: the lookup table */
5299
5300struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 PyObject_HEAD
5302 unsigned char level1[32];
5303 int count2, count3;
5304 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005305};
5306
5307static PyObject*
5308encoding_map_size(PyObject *obj, PyObject* args)
5309{
5310 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005311 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005313}
5314
5315static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005316 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 PyDoc_STR("Return the size (in bytes) of this object") },
5318 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005319};
5320
5321static void
5322encoding_map_dealloc(PyObject* o)
5323{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005324 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005325}
5326
5327static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005328 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 "EncodingMap", /*tp_name*/
5330 sizeof(struct encoding_map), /*tp_basicsize*/
5331 0, /*tp_itemsize*/
5332 /* methods */
5333 encoding_map_dealloc, /*tp_dealloc*/
5334 0, /*tp_print*/
5335 0, /*tp_getattr*/
5336 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005337 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 0, /*tp_repr*/
5339 0, /*tp_as_number*/
5340 0, /*tp_as_sequence*/
5341 0, /*tp_as_mapping*/
5342 0, /*tp_hash*/
5343 0, /*tp_call*/
5344 0, /*tp_str*/
5345 0, /*tp_getattro*/
5346 0, /*tp_setattro*/
5347 0, /*tp_as_buffer*/
5348 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5349 0, /*tp_doc*/
5350 0, /*tp_traverse*/
5351 0, /*tp_clear*/
5352 0, /*tp_richcompare*/
5353 0, /*tp_weaklistoffset*/
5354 0, /*tp_iter*/
5355 0, /*tp_iternext*/
5356 encoding_map_methods, /*tp_methods*/
5357 0, /*tp_members*/
5358 0, /*tp_getset*/
5359 0, /*tp_base*/
5360 0, /*tp_dict*/
5361 0, /*tp_descr_get*/
5362 0, /*tp_descr_set*/
5363 0, /*tp_dictoffset*/
5364 0, /*tp_init*/
5365 0, /*tp_alloc*/
5366 0, /*tp_new*/
5367 0, /*tp_free*/
5368 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005369};
5370
5371PyObject*
5372PyUnicode_BuildEncodingMap(PyObject* string)
5373{
5374 Py_UNICODE *decode;
5375 PyObject *result;
5376 struct encoding_map *mresult;
5377 int i;
5378 int need_dict = 0;
5379 unsigned char level1[32];
5380 unsigned char level2[512];
5381 unsigned char *mlevel1, *mlevel2, *mlevel3;
5382 int count2 = 0, count3 = 0;
5383
5384 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5385 PyErr_BadArgument();
5386 return NULL;
5387 }
5388 decode = PyUnicode_AS_UNICODE(string);
5389 memset(level1, 0xFF, sizeof level1);
5390 memset(level2, 0xFF, sizeof level2);
5391
5392 /* If there isn't a one-to-one mapping of NULL to \0,
5393 or if there are non-BMP characters, we need to use
5394 a mapping dictionary. */
5395 if (decode[0] != 0)
5396 need_dict = 1;
5397 for (i = 1; i < 256; i++) {
5398 int l1, l2;
5399 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005400#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005401 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005402#endif
5403 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005404 need_dict = 1;
5405 break;
5406 }
5407 if (decode[i] == 0xFFFE)
5408 /* unmapped character */
5409 continue;
5410 l1 = decode[i] >> 11;
5411 l2 = decode[i] >> 7;
5412 if (level1[l1] == 0xFF)
5413 level1[l1] = count2++;
5414 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005415 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005416 }
5417
5418 if (count2 >= 0xFF || count3 >= 0xFF)
5419 need_dict = 1;
5420
5421 if (need_dict) {
5422 PyObject *result = PyDict_New();
5423 PyObject *key, *value;
5424 if (!result)
5425 return NULL;
5426 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005427 key = PyLong_FromLong(decode[i]);
5428 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005429 if (!key || !value)
5430 goto failed1;
5431 if (PyDict_SetItem(result, key, value) == -1)
5432 goto failed1;
5433 Py_DECREF(key);
5434 Py_DECREF(value);
5435 }
5436 return result;
5437 failed1:
5438 Py_XDECREF(key);
5439 Py_XDECREF(value);
5440 Py_DECREF(result);
5441 return NULL;
5442 }
5443
5444 /* Create a three-level trie */
5445 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5446 16*count2 + 128*count3 - 1);
5447 if (!result)
5448 return PyErr_NoMemory();
5449 PyObject_Init(result, &EncodingMapType);
5450 mresult = (struct encoding_map*)result;
5451 mresult->count2 = count2;
5452 mresult->count3 = count3;
5453 mlevel1 = mresult->level1;
5454 mlevel2 = mresult->level23;
5455 mlevel3 = mresult->level23 + 16*count2;
5456 memcpy(mlevel1, level1, 32);
5457 memset(mlevel2, 0xFF, 16*count2);
5458 memset(mlevel3, 0, 128*count3);
5459 count3 = 0;
5460 for (i = 1; i < 256; i++) {
5461 int o1, o2, o3, i2, i3;
5462 if (decode[i] == 0xFFFE)
5463 /* unmapped character */
5464 continue;
5465 o1 = decode[i]>>11;
5466 o2 = (decode[i]>>7) & 0xF;
5467 i2 = 16*mlevel1[o1] + o2;
5468 if (mlevel2[i2] == 0xFF)
5469 mlevel2[i2] = count3++;
5470 o3 = decode[i] & 0x7F;
5471 i3 = 128*mlevel2[i2] + o3;
5472 mlevel3[i3] = i;
5473 }
5474 return result;
5475}
5476
5477static int
5478encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5479{
5480 struct encoding_map *map = (struct encoding_map*)mapping;
5481 int l1 = c>>11;
5482 int l2 = (c>>7) & 0xF;
5483 int l3 = c & 0x7F;
5484 int i;
5485
5486#ifdef Py_UNICODE_WIDE
5487 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005489 }
5490#endif
5491 if (c == 0)
5492 return 0;
5493 /* level 1*/
5494 i = map->level1[l1];
5495 if (i == 0xFF) {
5496 return -1;
5497 }
5498 /* level 2*/
5499 i = map->level23[16*i+l2];
5500 if (i == 0xFF) {
5501 return -1;
5502 }
5503 /* level 3 */
5504 i = map->level23[16*map->count2 + 128*i + l3];
5505 if (i == 0) {
5506 return -1;
5507 }
5508 return i;
5509}
5510
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511/* Lookup the character ch in the mapping. If the character
5512 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005513 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005514static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515{
Christian Heimes217cfd12007-12-02 14:31:20 +00005516 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 PyObject *x;
5518
5519 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005521 x = PyObject_GetItem(mapping, w);
5522 Py_DECREF(w);
5523 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5525 /* No mapping found means: mapping is undefined. */
5526 PyErr_Clear();
5527 x = Py_None;
5528 Py_INCREF(x);
5529 return x;
5530 } else
5531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005533 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005534 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005535 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 long value = PyLong_AS_LONG(x);
5537 if (value < 0 || value > 255) {
5538 PyErr_SetString(PyExc_TypeError,
5539 "character mapping must be in range(256)");
5540 Py_DECREF(x);
5541 return NULL;
5542 }
5543 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005545 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 /* wrong return value */
5549 PyErr_Format(PyExc_TypeError,
5550 "character mapping must return integer, bytes or None, not %.400s",
5551 x->ob_type->tp_name);
5552 Py_DECREF(x);
5553 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 }
5555}
5556
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005557static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005558charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005559{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005560 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5561 /* exponentially overallocate to minimize reallocations */
5562 if (requiredsize < 2*outsize)
5563 requiredsize = 2*outsize;
5564 if (_PyBytes_Resize(outobj, requiredsize))
5565 return -1;
5566 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005567}
5568
/* Outcome of encoding a single character with charmapencode_output():
     enc_SUCCESS   - the encoded byte(s) were appended to the output
     enc_FAILED    - the character has no mapping; nothing was written
     enc_EXCEPTION - a lookup or resize error occurred */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005572/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005573 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005574 space is available. Return a new reference to the object that
5575 was put in the output buffer, or Py_None, if the mapping was undefined
5576 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005577 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005579charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005581{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005582 PyObject *rep;
5583 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005584 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585
Christian Heimes90aa7642007-12-19 02:45:37 +00005586 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005587 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005589 if (res == -1)
5590 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 if (outsize<requiredsize)
5592 if (charmapencode_resize(outobj, outpos, requiredsize))
5593 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005594 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 outstart[(*outpos)++] = (char)res;
5596 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005597 }
5598
5599 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005602 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005603 Py_DECREF(rep);
5604 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005605 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 if (PyLong_Check(rep)) {
5607 Py_ssize_t requiredsize = *outpos+1;
5608 if (outsize<requiredsize)
5609 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5610 Py_DECREF(rep);
5611 return enc_EXCEPTION;
5612 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005613 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005614 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005615 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005616 else {
5617 const char *repchars = PyBytes_AS_STRING(rep);
5618 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5619 Py_ssize_t requiredsize = *outpos+repsize;
5620 if (outsize<requiredsize)
5621 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5622 Py_DECREF(rep);
5623 return enc_EXCEPTION;
5624 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005625 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 memcpy(outstart + *outpos, repchars, repsize);
5627 *outpos += repsize;
5628 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005629 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005630 Py_DECREF(rep);
5631 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632}
5633
5634/* handle an error in PyUnicode_EncodeCharmap
5635 Return 0 on success, -1 on error */
5636static
5637int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005638 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005640 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005641 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642{
5643 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005644 Py_ssize_t repsize;
5645 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 Py_UNICODE *uni2;
5647 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005648 Py_ssize_t collstartpos = *inpos;
5649 Py_ssize_t collendpos = *inpos+1;
5650 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 char *encoding = "charmap";
5652 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005653 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 /* find all unencodable characters */
5656 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005657 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005658 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 int res = encoding_map_lookup(p[collendpos], mapping);
5660 if (res != -1)
5661 break;
5662 ++collendpos;
5663 continue;
5664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005665
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 rep = charmapencode_lookup(p[collendpos], mapping);
5667 if (rep==NULL)
5668 return -1;
5669 else if (rep!=Py_None) {
5670 Py_DECREF(rep);
5671 break;
5672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005673 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675 }
5676 /* cache callback name lookup
5677 * (if not done yet, i.e. it's the first error) */
5678 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 if ((errors==NULL) || (!strcmp(errors, "strict")))
5680 *known_errorHandler = 1;
5681 else if (!strcmp(errors, "replace"))
5682 *known_errorHandler = 2;
5683 else if (!strcmp(errors, "ignore"))
5684 *known_errorHandler = 3;
5685 else if (!strcmp(errors, "xmlcharrefreplace"))
5686 *known_errorHandler = 4;
5687 else
5688 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005689 }
5690 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005691 case 1: /* strict */
5692 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5693 return -1;
5694 case 2: /* replace */
5695 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 x = charmapencode_output('?', mapping, res, respos);
5697 if (x==enc_EXCEPTION) {
5698 return -1;
5699 }
5700 else if (x==enc_FAILED) {
5701 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5702 return -1;
5703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005704 }
5705 /* fall through */
5706 case 3: /* ignore */
5707 *inpos = collendpos;
5708 break;
5709 case 4: /* xmlcharrefreplace */
5710 /* generate replacement (temporarily (mis)uses p) */
5711 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 char buffer[2+29+1+1];
5713 char *cp;
5714 sprintf(buffer, "&#%d;", (int)p[collpos]);
5715 for (cp = buffer; *cp; ++cp) {
5716 x = charmapencode_output(*cp, mapping, res, respos);
5717 if (x==enc_EXCEPTION)
5718 return -1;
5719 else if (x==enc_FAILED) {
5720 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5721 return -1;
5722 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005723 }
5724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005725 *inpos = collendpos;
5726 break;
5727 default:
5728 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 encoding, reason, p, size, exceptionObject,
5730 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005731 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005732 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005733 if (PyBytes_Check(repunicode)) {
5734 /* Directly copy bytes result to output. */
5735 Py_ssize_t outsize = PyBytes_Size(*res);
5736 Py_ssize_t requiredsize;
5737 repsize = PyBytes_Size(repunicode);
5738 requiredsize = *respos + repsize;
5739 if (requiredsize > outsize)
5740 /* Make room for all additional bytes. */
5741 if (charmapencode_resize(res, respos, requiredsize)) {
5742 Py_DECREF(repunicode);
5743 return -1;
5744 }
5745 memcpy(PyBytes_AsString(*res) + *respos,
5746 PyBytes_AsString(repunicode), repsize);
5747 *respos += repsize;
5748 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005749 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005750 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005751 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005752 /* generate replacement */
5753 repsize = PyUnicode_GET_SIZE(repunicode);
5754 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 x = charmapencode_output(*uni2, mapping, res, respos);
5756 if (x==enc_EXCEPTION) {
5757 return -1;
5758 }
5759 else if (x==enc_FAILED) {
5760 Py_DECREF(repunicode);
5761 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5762 return -1;
5763 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005764 }
5765 *inpos = newpos;
5766 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 }
5768 return 0;
5769}
5770
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 Py_ssize_t size,
5773 PyObject *mapping,
5774 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 /* output object */
5777 PyObject *res = NULL;
5778 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005779 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005781 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 PyObject *errorHandler = NULL;
5783 PyObject *exc = NULL;
5784 /* the following variable is used for caching string comparisons
5785 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5786 * 3=ignore, 4=xmlcharrefreplace */
5787 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788
5789 /* Default to Latin-1 */
5790 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005793 /* allocate enough for a simple encoding without
5794 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005795 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 if (res == NULL)
5797 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005798 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 /* try to encode it */
5803 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5804 if (x==enc_EXCEPTION) /* error */
5805 goto onError;
5806 if (x==enc_FAILED) { /* unencodable character */
5807 if (charmap_encoding_error(p, size, &inpos, mapping,
5808 &exc,
5809 &known_errorHandler, &errorHandler, errors,
5810 &res, &respos)) {
5811 goto onError;
5812 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005813 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005814 else
5815 /* done with this character => adjust input position */
5816 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005817 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005819 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005820 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005821 if (_PyBytes_Resize(&res, respos) < 0)
5822 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005823
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 Py_XDECREF(exc);
5825 Py_XDECREF(errorHandler);
5826 return res;
5827
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 Py_XDECREF(res);
5830 Py_XDECREF(exc);
5831 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832 return NULL;
5833}
5834
5835PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005837{
5838 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 PyErr_BadArgument();
5840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005841 }
5842 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 PyUnicode_GET_SIZE(unicode),
5844 mapping,
5845 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005846}
5847
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848/* create or adjust a UnicodeTranslateError */
5849static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 const Py_UNICODE *unicode, Py_ssize_t size,
5851 Py_ssize_t startpos, Py_ssize_t endpos,
5852 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005854 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005855 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 }
5858 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5860 goto onError;
5861 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5862 goto onError;
5863 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5864 goto onError;
5865 return;
5866 onError:
5867 Py_DECREF(*exceptionObject);
5868 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 }
5870}
5871
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005872/* raises a UnicodeTranslateError */
5873static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 const Py_UNICODE *unicode, Py_ssize_t size,
5875 Py_ssize_t startpos, Py_ssize_t endpos,
5876 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877{
5878 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005879 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882}
5883
5884/* error handling callback helper:
5885 build arguments, call the callback and check the arguments,
5886 put the result into newpos and return the replacement string, which
5887 has to be freed by the caller */
5888static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 PyObject **errorHandler,
5890 const char *reason,
5891 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5892 Py_ssize_t startpos, Py_ssize_t endpos,
5893 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005895 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005896
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005897 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 PyObject *restuple;
5899 PyObject *resunicode;
5900
5901 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005905 }
5906
5907 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911
5912 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005914 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005917 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 Py_DECREF(restuple);
5919 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005920 }
5921 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 &resunicode, &i_newpos)) {
5923 Py_DECREF(restuple);
5924 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005926 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005928 else
5929 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005930 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5932 Py_DECREF(restuple);
5933 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005934 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005935 Py_INCREF(resunicode);
5936 Py_DECREF(restuple);
5937 return resunicode;
5938}
5939
5940/* Lookup the character ch in the mapping and put the result in result,
5941 which must be decrefed by the caller.
5942 Return 0 on success, -1 on error */
5943static
5944int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5945{
Christian Heimes217cfd12007-12-02 14:31:20 +00005946 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 PyObject *x;
5948
5949 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005951 x = PyObject_GetItem(mapping, w);
5952 Py_DECREF(w);
5953 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5955 /* No mapping found means: use 1:1 mapping. */
5956 PyErr_Clear();
5957 *result = NULL;
5958 return 0;
5959 } else
5960 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961 }
5962 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 *result = x;
5964 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005965 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005966 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 long value = PyLong_AS_LONG(x);
5968 long max = PyUnicode_GetMax();
5969 if (value < 0 || value > max) {
5970 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005971 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 Py_DECREF(x);
5973 return -1;
5974 }
5975 *result = x;
5976 return 0;
5977 }
5978 else if (PyUnicode_Check(x)) {
5979 *result = x;
5980 return 0;
5981 }
5982 else {
5983 /* wrong return value */
5984 PyErr_SetString(PyExc_TypeError,
5985 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005986 Py_DECREF(x);
5987 return -1;
5988 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989}
5990/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00005991 if not reallocate and adjust various state variables.
5992 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005993static
Walter Dörwald4894c302003-10-24 14:25:28 +00005994int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996{
Martin v. Löwis18e16552006-02-15 17:27:45 +00005997 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00005998 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 /* remember old output position */
6000 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6001 /* exponentially overallocate to minimize reallocations */
6002 if (requiredsize < 2 * oldsize)
6003 requiredsize = 2 * oldsize;
6004 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6005 return -1;
6006 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 }
6008 return 0;
6009}
6010/* lookup the character, put the result in the output string and adjust
6011 various state variables. Return a new reference to the object that
6012 was put in the output buffer in *result, or Py_None, if the mapping was
6013 undefined (in which case no character was written).
6014 The called must decref result.
6015 Return 0 on success, -1 on error. */
6016static
Walter Dörwald4894c302003-10-24 14:25:28 +00006017int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6019 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020{
Walter Dörwald4894c302003-10-24 14:25:28 +00006021 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006023 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 /* not found => default to 1:1 mapping */
6025 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 }
6027 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006029 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 /* no overflow check, because we know that the space is enough */
6031 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006032 }
6033 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6035 if (repsize==1) {
6036 /* no overflow check, because we know that the space is enough */
6037 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6038 }
6039 else if (repsize!=0) {
6040 /* more than one character */
6041 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6042 (insize - (curinp-startinp)) +
6043 repsize - 1;
6044 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6045 return -1;
6046 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6047 *outp += repsize;
6048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049 }
6050 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052 return 0;
6053}
6054
6055PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 Py_ssize_t size,
6057 PyObject *mapping,
6058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060 /* output object */
6061 PyObject *res = NULL;
6062 /* pointers to the beginning and end+1 of input */
6063 const Py_UNICODE *startp = p;
6064 const Py_UNICODE *endp = p + size;
6065 /* pointer into the output */
6066 Py_UNICODE *str;
6067 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006068 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069 char *reason = "character maps to <undefined>";
6070 PyObject *errorHandler = NULL;
6071 PyObject *exc = NULL;
6072 /* the following variable is used for caching string comparisons
6073 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6074 * 3=ignore, 4=xmlcharrefreplace */
6075 int known_errorHandler = -1;
6076
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 PyErr_BadArgument();
6079 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006081
6082 /* allocate enough for a simple 1:1 translation without
6083 replacements, if we need more, we'll resize */
6084 res = PyUnicode_FromUnicode(NULL, size);
6085 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006087 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006089 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006091 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 /* try to encode it */
6093 PyObject *x = NULL;
6094 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6095 Py_XDECREF(x);
6096 goto onError;
6097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006098 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 if (x!=Py_None) /* it worked => adjust input pointer */
6100 ++p;
6101 else { /* untranslatable character */
6102 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6103 Py_ssize_t repsize;
6104 Py_ssize_t newpos;
6105 Py_UNICODE *uni2;
6106 /* startpos for collecting untranslatable chars */
6107 const Py_UNICODE *collstart = p;
6108 const Py_UNICODE *collend = p+1;
6109 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 /* find all untranslatable characters */
6112 while (collend < endp) {
6113 if (charmaptranslate_lookup(*collend, mapping, &x))
6114 goto onError;
6115 Py_XDECREF(x);
6116 if (x!=Py_None)
6117 break;
6118 ++collend;
6119 }
6120 /* cache callback name lookup
6121 * (if not done yet, i.e. it's the first error) */
6122 if (known_errorHandler==-1) {
6123 if ((errors==NULL) || (!strcmp(errors, "strict")))
6124 known_errorHandler = 1;
6125 else if (!strcmp(errors, "replace"))
6126 known_errorHandler = 2;
6127 else if (!strcmp(errors, "ignore"))
6128 known_errorHandler = 3;
6129 else if (!strcmp(errors, "xmlcharrefreplace"))
6130 known_errorHandler = 4;
6131 else
6132 known_errorHandler = 0;
6133 }
6134 switch (known_errorHandler) {
6135 case 1: /* strict */
6136 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006137 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 case 2: /* replace */
6139 /* No need to check for space, this is a 1:1 replacement */
6140 for (coll = collstart; coll<collend; ++coll)
6141 *str++ = '?';
6142 /* fall through */
6143 case 3: /* ignore */
6144 p = collend;
6145 break;
6146 case 4: /* xmlcharrefreplace */
6147 /* generate replacement (temporarily (mis)uses p) */
6148 for (p = collstart; p < collend; ++p) {
6149 char buffer[2+29+1+1];
6150 char *cp;
6151 sprintf(buffer, "&#%d;", (int)*p);
6152 if (charmaptranslate_makespace(&res, &str,
6153 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6154 goto onError;
6155 for (cp = buffer; *cp; ++cp)
6156 *str++ = *cp;
6157 }
6158 p = collend;
6159 break;
6160 default:
6161 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6162 reason, startp, size, &exc,
6163 collstart-startp, collend-startp, &newpos);
6164 if (repunicode == NULL)
6165 goto onError;
6166 /* generate replacement */
6167 repsize = PyUnicode_GET_SIZE(repunicode);
6168 if (charmaptranslate_makespace(&res, &str,
6169 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6170 Py_DECREF(repunicode);
6171 goto onError;
6172 }
6173 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6174 *str++ = *uni2;
6175 p = startp + newpos;
6176 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006177 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006178 }
6179 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006180 /* Resize if we allocated to much */
6181 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006182 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006183 if (PyUnicode_Resize(&res, respos) < 0)
6184 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006185 }
6186 Py_XDECREF(exc);
6187 Py_XDECREF(errorHandler);
6188 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006191 Py_XDECREF(res);
6192 Py_XDECREF(exc);
6193 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 return NULL;
6195}
6196
6197PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 PyObject *mapping,
6199 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200{
6201 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006202
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 str = PyUnicode_FromObject(str);
6204 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 PyUnicode_GET_SIZE(str),
6208 mapping,
6209 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 Py_DECREF(str);
6211 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006212
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 Py_XDECREF(str);
6215 return NULL;
6216}
Tim Petersced69f82003-09-16 20:30:58 +00006217
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006218PyObject *
6219PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6220 Py_ssize_t length)
6221{
6222 PyObject *result;
6223 Py_UNICODE *p; /* write pointer into result */
6224 Py_ssize_t i;
6225 /* Copy to a new string */
6226 result = (PyObject *)_PyUnicode_New(length);
6227 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6228 if (result == NULL)
6229 return result;
6230 p = PyUnicode_AS_UNICODE(result);
6231 /* Iterate over code points */
6232 for (i = 0; i < length; i++) {
6233 Py_UNICODE ch =s[i];
6234 if (ch > 127) {
6235 int decimal = Py_UNICODE_TODECIMAL(ch);
6236 if (decimal >= 0)
6237 p[i] = '0' + decimal;
6238 }
6239 }
6240 return result;
6241}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006242/* --- Decimal Encoder ---------------------------------------------------- */
6243
6244int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 Py_ssize_t length,
6246 char *output,
6247 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006248{
6249 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006250 PyObject *errorHandler = NULL;
6251 PyObject *exc = NULL;
6252 const char *encoding = "decimal";
6253 const char *reason = "invalid decimal Unicode string";
6254 /* the following variable is used for caching string comparisons
6255 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6256 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006257
6258 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 PyErr_BadArgument();
6260 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006261 }
6262
6263 p = s;
6264 end = s + length;
6265 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 register Py_UNICODE ch = *p;
6267 int decimal;
6268 PyObject *repunicode;
6269 Py_ssize_t repsize;
6270 Py_ssize_t newpos;
6271 Py_UNICODE *uni2;
6272 Py_UNICODE *collstart;
6273 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006274
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006276 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 ++p;
6278 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006279 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 decimal = Py_UNICODE_TODECIMAL(ch);
6281 if (decimal >= 0) {
6282 *output++ = '0' + decimal;
6283 ++p;
6284 continue;
6285 }
6286 if (0 < ch && ch < 256) {
6287 *output++ = (char)ch;
6288 ++p;
6289 continue;
6290 }
6291 /* All other characters are considered unencodable */
6292 collstart = p;
6293 collend = p+1;
6294 while (collend < end) {
6295 if ((0 < *collend && *collend < 256) ||
6296 !Py_UNICODE_ISSPACE(*collend) ||
6297 Py_UNICODE_TODECIMAL(*collend))
6298 break;
6299 }
6300 /* cache callback name lookup
6301 * (if not done yet, i.e. it's the first error) */
6302 if (known_errorHandler==-1) {
6303 if ((errors==NULL) || (!strcmp(errors, "strict")))
6304 known_errorHandler = 1;
6305 else if (!strcmp(errors, "replace"))
6306 known_errorHandler = 2;
6307 else if (!strcmp(errors, "ignore"))
6308 known_errorHandler = 3;
6309 else if (!strcmp(errors, "xmlcharrefreplace"))
6310 known_errorHandler = 4;
6311 else
6312 known_errorHandler = 0;
6313 }
6314 switch (known_errorHandler) {
6315 case 1: /* strict */
6316 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6317 goto onError;
6318 case 2: /* replace */
6319 for (p = collstart; p < collend; ++p)
6320 *output++ = '?';
6321 /* fall through */
6322 case 3: /* ignore */
6323 p = collend;
6324 break;
6325 case 4: /* xmlcharrefreplace */
6326 /* generate replacement (temporarily (mis)uses p) */
6327 for (p = collstart; p < collend; ++p)
6328 output += sprintf(output, "&#%d;", (int)*p);
6329 p = collend;
6330 break;
6331 default:
6332 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6333 encoding, reason, s, length, &exc,
6334 collstart-s, collend-s, &newpos);
6335 if (repunicode == NULL)
6336 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006337 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006338 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006339 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6340 Py_DECREF(repunicode);
6341 goto onError;
6342 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006343 /* generate replacement */
6344 repsize = PyUnicode_GET_SIZE(repunicode);
6345 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6346 Py_UNICODE ch = *uni2;
6347 if (Py_UNICODE_ISSPACE(ch))
6348 *output++ = ' ';
6349 else {
6350 decimal = Py_UNICODE_TODECIMAL(ch);
6351 if (decimal >= 0)
6352 *output++ = '0' + decimal;
6353 else if (0 < ch && ch < 256)
6354 *output++ = (char)ch;
6355 else {
6356 Py_DECREF(repunicode);
6357 raise_encode_exception(&exc, encoding,
6358 s, length, collstart-s, collend-s, reason);
6359 goto onError;
6360 }
6361 }
6362 }
6363 p = s + newpos;
6364 Py_DECREF(repunicode);
6365 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006366 }
6367 /* 0-terminate the output string */
6368 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369 Py_XDECREF(exc);
6370 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006371 return 0;
6372
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006374 Py_XDECREF(exc);
6375 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006376 return -1;
6377}
6378
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379/* --- Helpers ------------------------------------------------------------ */
6380
Eric Smith8c663262007-08-25 02:26:07 +00006381#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006382#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006383
Thomas Wouters477c8d52006-05-27 19:21:47 +00006384#include "stringlib/count.h"
6385#include "stringlib/find.h"
6386#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006387#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006388
Eric Smith5807c412008-05-11 21:00:57 +00006389#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006390#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006391#include "stringlib/localeutil.h"
6392
/* helper macro to fixup start/end slice values:
   end is clipped to len; a negative start or end has len added to it
   and is clamped to 0 if it is still negative.
   start and end must be modifiable lvalues and are evaluated several
   times. Every argument is parenthesized so that any expression can be
   passed as len. The expansion is two statements, so do not use it as
   the body of an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006407
Martin v. Löwis18e16552006-02-15 17:27:45 +00006408Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006409 PyObject *substr,
6410 Py_ssize_t start,
6411 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006413 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006414 PyUnicodeObject* str_obj;
6415 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006416
Thomas Wouters477c8d52006-05-27 19:21:47 +00006417 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6418 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006420 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6421 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 Py_DECREF(str_obj);
6423 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 }
Tim Petersced69f82003-09-16 20:30:58 +00006425
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006426 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006427 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006428 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6429 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006430 );
6431
6432 Py_DECREF(sub_obj);
6433 Py_DECREF(str_obj);
6434
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 return result;
6436}
6437
Martin v. Löwis18e16552006-02-15 17:27:45 +00006438Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006439 PyObject *sub,
6440 Py_ssize_t start,
6441 Py_ssize_t end,
6442 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006444 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006445
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006447 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006449 sub = PyUnicode_FromObject(sub);
6450 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 Py_DECREF(str);
6452 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 }
Tim Petersced69f82003-09-16 20:30:58 +00006454
Thomas Wouters477c8d52006-05-27 19:21:47 +00006455 if (direction > 0)
6456 result = stringlib_find_slice(
6457 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6458 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6459 start, end
6460 );
6461 else
6462 result = stringlib_rfind_slice(
6463 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6464 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6465 start, end
6466 );
6467
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006469 Py_DECREF(sub);
6470
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 return result;
6472}
6473
Tim Petersced69f82003-09-16 20:30:58 +00006474static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 PyUnicodeObject *substring,
6477 Py_ssize_t start,
6478 Py_ssize_t end,
6479 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 if (substring->length == 0)
6482 return 1;
6483
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006484 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 end -= substring->length;
6486 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488
6489 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 if (Py_UNICODE_MATCH(self, end, substring))
6491 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 } else {
6493 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 }
6496
6497 return 0;
6498}
6499
Martin v. Löwis18e16552006-02-15 17:27:45 +00006500Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 PyObject *substr,
6502 Py_ssize_t start,
6503 Py_ssize_t end,
6504 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006506 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006507
Guido van Rossumd57fd912000-03-10 22:53:23 +00006508 str = PyUnicode_FromObject(str);
6509 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 substr = PyUnicode_FromObject(substr);
6512 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 Py_DECREF(str);
6514 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515 }
Tim Petersced69f82003-09-16 20:30:58 +00006516
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 (PyUnicodeObject *)substr,
6519 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520 Py_DECREF(str);
6521 Py_DECREF(substr);
6522 return result;
6523}
6524
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525/* Apply fixfct filter to the Unicode object self and return a
6526 reference to the modified object */
6527
Tim Petersced69f82003-09-16 20:30:58 +00006528static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531{
6532
6533 PyUnicodeObject *u;
6534
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006535 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006536 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006538
6539 Py_UNICODE_COPY(u->str, self->str, self->length);
6540
Tim Peters7a29bd52001-09-12 03:03:31 +00006541 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 /* fixfct should return TRUE if it modified the buffer. If
6543 FALSE, return a reference to the original buffer instead
6544 (to save space, not time) */
6545 Py_INCREF(self);
6546 Py_DECREF(u);
6547 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 }
6549 return (PyObject*) u;
6550}
6551
Tim Petersced69f82003-09-16 20:30:58 +00006552static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553int fixupper(PyUnicodeObject *self)
6554{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006555 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556 Py_UNICODE *s = self->str;
6557 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006558
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006561
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 ch = Py_UNICODE_TOUPPER(*s);
6563 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 *s = ch;
6566 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 s++;
6568 }
6569
6570 return status;
6571}
6572
Tim Petersced69f82003-09-16 20:30:58 +00006573static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574int fixlower(PyUnicodeObject *self)
6575{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006576 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 Py_UNICODE *s = self->str;
6578 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006579
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006582
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 ch = Py_UNICODE_TOLOWER(*s);
6584 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 *s = ch;
6587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 s++;
6589 }
6590
6591 return status;
6592}
6593
Tim Petersced69f82003-09-16 20:30:58 +00006594static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595int fixswapcase(PyUnicodeObject *self)
6596{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006597 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598 Py_UNICODE *s = self->str;
6599 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006600
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 while (len-- > 0) {
6602 if (Py_UNICODE_ISUPPER(*s)) {
6603 *s = Py_UNICODE_TOLOWER(*s);
6604 status = 1;
6605 } else if (Py_UNICODE_ISLOWER(*s)) {
6606 *s = Py_UNICODE_TOUPPER(*s);
6607 status = 1;
6608 }
6609 s++;
6610 }
6611
6612 return status;
6613}
6614
Tim Petersced69f82003-09-16 20:30:58 +00006615static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616int fixcapitalize(PyUnicodeObject *self)
6617{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006618 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006619 Py_UNICODE *s = self->str;
6620 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006621
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006622 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006623 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006624 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 *s = Py_UNICODE_TOUPPER(*s);
6626 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006628 s++;
6629 while (--len > 0) {
6630 if (Py_UNICODE_ISUPPER(*s)) {
6631 *s = Py_UNICODE_TOLOWER(*s);
6632 status = 1;
6633 }
6634 s++;
6635 }
6636 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637}
6638
6639static
6640int fixtitle(PyUnicodeObject *self)
6641{
6642 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6643 register Py_UNICODE *e;
6644 int previous_is_cased;
6645
6646 /* Shortcut for single character strings */
6647 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6649 if (*p != ch) {
6650 *p = ch;
6651 return 1;
6652 }
6653 else
6654 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 }
Tim Petersced69f82003-09-16 20:30:58 +00006656
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657 e = p + PyUnicode_GET_SIZE(self);
6658 previous_is_cased = 0;
6659 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006660 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006661
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 if (previous_is_cased)
6663 *p = Py_UNICODE_TOLOWER(ch);
6664 else
6665 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006666
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 if (Py_UNICODE_ISLOWER(ch) ||
6668 Py_UNICODE_ISUPPER(ch) ||
6669 Py_UNICODE_ISTITLE(ch))
6670 previous_is_cased = 1;
6671 else
6672 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 }
6674 return 1;
6675}
6676
Tim Peters8ce9f162004-08-27 01:49:32 +00006677PyObject *
6678PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679{
Skip Montanaro6543b452004-09-16 03:28:13 +00006680 const Py_UNICODE blank = ' ';
6681 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006682 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006683 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006684 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6685 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006686 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6687 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006688 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006689 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
Tim Peters05eba1f2004-08-27 21:32:02 +00006691 fseq = PySequence_Fast(seq, "");
6692 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006693 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006694 }
6695
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006696 /* NOTE: the following code can't call back into Python code,
6697 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006698 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006699
Tim Peters05eba1f2004-08-27 21:32:02 +00006700 seqlen = PySequence_Fast_GET_SIZE(fseq);
6701 /* If empty sequence, return u"". */
6702 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006703 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6704 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006705 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006706 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006707 /* If singleton sequence with an exact Unicode, return that. */
6708 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 item = items[0];
6710 if (PyUnicode_CheckExact(item)) {
6711 Py_INCREF(item);
6712 res = (PyUnicodeObject *)item;
6713 goto Done;
6714 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006715 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006716 else {
6717 /* Set up sep and seplen */
6718 if (separator == NULL) {
6719 sep = &blank;
6720 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006721 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006722 else {
6723 if (!PyUnicode_Check(separator)) {
6724 PyErr_Format(PyExc_TypeError,
6725 "separator: expected str instance,"
6726 " %.80s found",
6727 Py_TYPE(separator)->tp_name);
6728 goto onError;
6729 }
6730 sep = PyUnicode_AS_UNICODE(separator);
6731 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006732 }
6733 }
6734
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006735 /* There are at least two things to join, or else we have a subclass
6736 * of str in the sequence.
6737 * Do a pre-pass to figure out the total amount of space we'll
6738 * need (sz), and see whether all argument are strings.
6739 */
6740 sz = 0;
6741 for (i = 0; i < seqlen; i++) {
6742 const Py_ssize_t old_sz = sz;
6743 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 if (!PyUnicode_Check(item)) {
6745 PyErr_Format(PyExc_TypeError,
6746 "sequence item %zd: expected str instance,"
6747 " %.80s found",
6748 i, Py_TYPE(item)->tp_name);
6749 goto onError;
6750 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006751 sz += PyUnicode_GET_SIZE(item);
6752 if (i != 0)
6753 sz += seplen;
6754 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6755 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006756 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006757 goto onError;
6758 }
6759 }
Tim Petersced69f82003-09-16 20:30:58 +00006760
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006761 res = _PyUnicode_New(sz);
6762 if (res == NULL)
6763 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006764
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006765 /* Catenate everything. */
6766 res_p = PyUnicode_AS_UNICODE(res);
6767 for (i = 0; i < seqlen; ++i) {
6768 Py_ssize_t itemlen;
6769 item = items[i];
6770 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 /* Copy item, and maybe the separator. */
6772 if (i) {
6773 Py_UNICODE_COPY(res_p, sep, seplen);
6774 res_p += seplen;
6775 }
6776 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6777 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006778 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006779
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006781 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 return (PyObject *)res;
6783
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006785 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006786 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 return NULL;
6788}
6789
Tim Petersced69f82003-09-16 20:30:58 +00006790static
6791PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 Py_ssize_t left,
6793 Py_ssize_t right,
6794 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795{
6796 PyUnicodeObject *u;
6797
6798 if (left < 0)
6799 left = 0;
6800 if (right < 0)
6801 right = 0;
6802
Tim Peters7a29bd52001-09-12 03:03:31 +00006803 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 Py_INCREF(self);
6805 return self;
6806 }
6807
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006808 if (left > PY_SSIZE_T_MAX - self->length ||
6809 right > PY_SSIZE_T_MAX - (left + self->length)) {
6810 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6811 return NULL;
6812 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 u = _PyUnicode_New(left + self->length + right);
6814 if (u) {
6815 if (left)
6816 Py_UNICODE_FILL(u->str, fill, left);
6817 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6818 if (right)
6819 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6820 }
6821
6822 return u;
6823}
6824
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006825PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006828
6829 string = PyUnicode_FromObject(string);
6830 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006833 list = stringlib_splitlines(
6834 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6835 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836
6837 Py_DECREF(string);
6838 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839}
6840
Tim Petersced69f82003-09-16 20:30:58 +00006841static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 PyUnicodeObject *substring,
6844 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006847 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006850 return stringlib_split_whitespace(
6851 (PyObject*) self, self->str, self->length, maxcount
6852 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006854 return stringlib_split(
6855 (PyObject*) self, self->str, self->length,
6856 substring->str, substring->length,
6857 maxcount
6858 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859}
6860
Tim Petersced69f82003-09-16 20:30:58 +00006861static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006862PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 PyUnicodeObject *substring,
6864 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006865{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006866 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006867 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006868
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006869 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006870 return stringlib_rsplit_whitespace(
6871 (PyObject*) self, self->str, self->length, maxcount
6872 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006873
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006874 return stringlib_rsplit(
6875 (PyObject*) self, self->str, self->length,
6876 substring->str, substring->length,
6877 maxcount
6878 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006879}
6880
6881static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006882PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 PyUnicodeObject *str1,
6884 PyUnicodeObject *str2,
6885 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886{
6887 PyUnicodeObject *u;
6888
6889 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006891 else if (maxcount == 0 || self->length == 0)
6892 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893
Thomas Wouters477c8d52006-05-27 19:21:47 +00006894 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006895 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006896 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006897 if (str1->length == 0)
6898 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006899 if (str1->length == 1) {
6900 /* replace characters */
6901 Py_UNICODE u1, u2;
6902 if (!findchar(self->str, self->length, str1->str[0]))
6903 goto nothing;
6904 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6905 if (!u)
6906 return NULL;
6907 Py_UNICODE_COPY(u->str, self->str, self->length);
6908 u1 = str1->str[0];
6909 u2 = str2->str[0];
6910 for (i = 0; i < u->length; i++)
6911 if (u->str[i] == u1) {
6912 if (--maxcount < 0)
6913 break;
6914 u->str[i] = u2;
6915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006917 i = stringlib_find(
6918 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006920 if (i < 0)
6921 goto nothing;
6922 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6923 if (!u)
6924 return NULL;
6925 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006926
6927 /* change everything in-place, starting with this one */
6928 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6929 i += str1->length;
6930
6931 while ( --maxcount > 0) {
6932 i = stringlib_find(self->str+i, self->length-i,
6933 str1->str, str1->length,
6934 i);
6935 if (i == -1)
6936 break;
6937 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6938 i += str1->length;
6939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006942
Brett Cannonb94767f2011-02-22 20:15:44 +00006943 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006944 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 Py_UNICODE *p;
6946
6947 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006948 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6949 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006950 if (n == 0)
6951 goto nothing;
6952 /* new_size = self->length + n * (str2->length - str1->length)); */
6953 delta = (str2->length - str1->length);
6954 if (delta == 0) {
6955 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006956 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006957 product = n * (str2->length - str1->length);
6958 if ((product / (str2->length - str1->length)) != n) {
6959 PyErr_SetString(PyExc_OverflowError,
6960 "replace string is too long");
6961 return NULL;
6962 }
6963 new_size = self->length + product;
6964 if (new_size < 0) {
6965 PyErr_SetString(PyExc_OverflowError,
6966 "replace string is too long");
6967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 }
6969 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006970 u = _PyUnicode_New(new_size);
6971 if (!u)
6972 return NULL;
6973 i = 0;
6974 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006975 if (str1->length > 0) {
6976 while (n-- > 0) {
6977 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006978 j = stringlib_find(self->str+i, self->length-i,
6979 str1->str, str1->length,
6980 i);
6981 if (j == -1)
6982 break;
6983 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006984 /* copy unchanged part [i:j] */
6985 Py_UNICODE_COPY(p, self->str+i, j-i);
6986 p += j - i;
6987 }
6988 /* copy substitution string */
6989 if (str2->length > 0) {
6990 Py_UNICODE_COPY(p, str2->str, str2->length);
6991 p += str2->length;
6992 }
6993 i = j + str1->length;
6994 }
6995 if (i < self->length)
6996 /* copy tail [i:] */
6997 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6998 } else {
6999 /* interleave */
7000 while (n > 0) {
7001 Py_UNICODE_COPY(p, str2->str, str2->length);
7002 p += str2->length;
7003 if (--n <= 0)
7004 break;
7005 *p++ = self->str[i++];
7006 }
7007 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007011
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007013 /* nothing to replace; return original string (when possible) */
7014 if (PyUnicode_CheckExact(self)) {
7015 Py_INCREF(self);
7016 return (PyObject *) self;
7017 }
7018 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019}
7020
7021/* --- Unicode Object Methods --------------------------------------------- */
7022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007023PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007024 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025\n\
7026Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007027characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028
7029static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007030unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 return fixup(self, fixtitle);
7033}
7034
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007035PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037\n\
7038Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007039have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007040
7041static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007042unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 return fixup(self, fixcapitalize);
7045}
7046
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self on whitespace,
   replace every word by its fixcapitalize'd version, then join the
   words with PyUnicode_Join() using the default separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;     /* stays NULL if capitalizing fails */
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL)
            goto done;
        /* release the old word before storing its replacement */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
7084
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007085/* Argument converter. Coerces to a single unicode character */
7086
7087static int
7088convert_uc(PyObject *obj, void *addr)
7089{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007090 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7091 PyObject *uniobj;
7092 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007093
Benjamin Peterson14339b62009-01-31 16:36:08 +00007094 uniobj = PyUnicode_FromObject(obj);
7095 if (uniobj == NULL) {
7096 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007097 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007098 return 0;
7099 }
7100 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7101 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007103 Py_DECREF(uniobj);
7104 return 0;
7105 }
7106 unistr = PyUnicode_AS_UNICODE(uniobj);
7107 *fillcharloc = unistr[0];
7108 Py_DECREF(uniobj);
7109 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007110}
7111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007112PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007115Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007116done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117
7118static PyObject *
7119unicode_center(PyUnicodeObject *self, PyObject *args)
7120{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007121 Py_ssize_t marg, left;
7122 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007123 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124
Thomas Woutersde017742006-02-16 19:34:37 +00007125 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126 return NULL;
7127
Tim Peters7a29bd52001-09-12 03:03:31 +00007128 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129 Py_INCREF(self);
7130 return (PyObject*) self;
7131 }
7132
7133 marg = width - self->length;
7134 left = marg / 2 + (marg & width & 1);
7135
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007136 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137}
7138
Marc-André Lemburge5034372000-08-08 08:04:29 +00007139#if 0
7140
7141/* This code should go into some future Unicode collation support
7142 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007143 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007144
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007145/* speedy UTF-16 code point order comparison */
7146/* gleaned from: */
7147/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7148
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007149static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007150{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007151 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007152 0, 0, 0, 0, 0, 0, 0, 0,
7153 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007154 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007155};
7156
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157static int
7158unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7159{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007160 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007161
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 Py_UNICODE *s1 = str1->str;
7163 Py_UNICODE *s2 = str2->str;
7164
7165 len1 = str1->length;
7166 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007167
Guido van Rossumd57fd912000-03-10 22:53:23 +00007168 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007169 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007170
7171 c1 = *s1++;
7172 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007173
Benjamin Peterson29060642009-01-31 22:14:21 +00007174 if (c1 > (1<<11) * 26)
7175 c1 += utf16Fixup[c1>>11];
7176 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007177 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007178 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007179
7180 if (c1 != c2)
7181 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007182
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007183 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007184 }
7185
7186 return (len1 < len2) ? -1 : (len1 != len2);
7187}
7188
Marc-André Lemburge5034372000-08-08 08:04:29 +00007189#else
7190
7191static int
7192unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7193{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007194 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007195
7196 Py_UNICODE *s1 = str1->str;
7197 Py_UNICODE *s2 = str2->str;
7198
7199 len1 = str1->length;
7200 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007201
Marc-André Lemburge5034372000-08-08 08:04:29 +00007202 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007203 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007204
Fredrik Lundh45714e92001-06-26 16:39:36 +00007205 c1 = *s1++;
7206 c2 = *s2++;
7207
7208 if (c1 != c2)
7209 return (c1 < c2) ? -1 : 1;
7210
Marc-André Lemburge5034372000-08-08 08:04:29 +00007211 len1--; len2--;
7212 }
7213
7214 return (len1 < len2) ? -1 : (len1 != len2);
7215}
7216
7217#endif
7218
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007221{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007222 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7223 return unicode_compare((PyUnicodeObject *)left,
7224 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007225 PyErr_Format(PyExc_TypeError,
7226 "Can't compare %.100s and %.100s",
7227 left->ob_type->tp_name,
7228 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007229 return -1;
7230}
7231
Martin v. Löwis5b222132007-06-10 09:51:05 +00007232int
7233PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7234{
7235 int i;
7236 Py_UNICODE *id;
7237 assert(PyUnicode_Check(uni));
7238 id = PyUnicode_AS_UNICODE(uni);
7239 /* Compare Unicode string and source character set string */
7240 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007241 if (id[i] != str[i])
7242 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007243 /* This check keeps Python strings that end in '\0' from comparing equal
7244 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007245 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007247 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007248 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007249 return 0;
7250}
7251
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007252
Benjamin Peterson29060642009-01-31 22:14:21 +00007253#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007254 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007255
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007256PyObject *PyUnicode_RichCompare(PyObject *left,
7257 PyObject *right,
7258 int op)
7259{
7260 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007261
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007262 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7263 PyObject *v;
7264 if (((PyUnicodeObject *) left)->length !=
7265 ((PyUnicodeObject *) right)->length) {
7266 if (op == Py_EQ) {
7267 Py_INCREF(Py_False);
7268 return Py_False;
7269 }
7270 if (op == Py_NE) {
7271 Py_INCREF(Py_True);
7272 return Py_True;
7273 }
7274 }
7275 if (left == right)
7276 result = 0;
7277 else
7278 result = unicode_compare((PyUnicodeObject *)left,
7279 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007280
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007281 /* Convert the return value to a Boolean */
7282 switch (op) {
7283 case Py_EQ:
7284 v = TEST_COND(result == 0);
7285 break;
7286 case Py_NE:
7287 v = TEST_COND(result != 0);
7288 break;
7289 case Py_LE:
7290 v = TEST_COND(result <= 0);
7291 break;
7292 case Py_GE:
7293 v = TEST_COND(result >= 0);
7294 break;
7295 case Py_LT:
7296 v = TEST_COND(result == -1);
7297 break;
7298 case Py_GT:
7299 v = TEST_COND(result == 1);
7300 break;
7301 default:
7302 PyErr_BadArgument();
7303 return NULL;
7304 }
7305 Py_INCREF(v);
7306 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007307 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007308
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007309 Py_INCREF(Py_NotImplemented);
7310 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007311}
7312
Guido van Rossum403d68b2000-03-13 15:55:09 +00007313int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007314 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007315{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007316 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007317 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007318
7319 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007320 sub = PyUnicode_FromObject(element);
7321 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007322 PyErr_Format(PyExc_TypeError,
7323 "'in <string>' requires string as left operand, not %s",
7324 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007325 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007326 }
7327
Thomas Wouters477c8d52006-05-27 19:21:47 +00007328 str = PyUnicode_FromObject(container);
7329 if (!str) {
7330 Py_DECREF(sub);
7331 return -1;
7332 }
7333
7334 result = stringlib_contains_obj(str, sub);
7335
7336 Py_DECREF(str);
7337 Py_DECREF(sub);
7338
Guido van Rossum403d68b2000-03-13 15:55:09 +00007339 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007340}
7341
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342/* Concat to string or Unicode object giving a new Unicode object. */
7343
7344PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346{
7347 PyUnicodeObject *u = NULL, *v = NULL, *w;
7348
7349 /* Coerce the two arguments */
7350 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7351 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7354 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007355 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356
7357 /* Shortcuts */
7358 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 Py_DECREF(v);
7360 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 }
7362 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007363 Py_DECREF(u);
7364 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365 }
7366
7367 /* Concat the two Unicode strings */
7368 w = _PyUnicode_New(u->length + v->length);
7369 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007371 Py_UNICODE_COPY(w->str, u->str, u->length);
7372 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7373
7374 Py_DECREF(u);
7375 Py_DECREF(v);
7376 return (PyObject *)w;
7377
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 Py_XDECREF(u);
7380 Py_XDECREF(v);
7381 return NULL;
7382}
7383
Walter Dörwald1ab83302007-05-18 17:15:44 +00007384void
7385PyUnicode_Append(PyObject **pleft, PyObject *right)
7386{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007387 PyObject *new;
7388 if (*pleft == NULL)
7389 return;
7390 if (right == NULL || !PyUnicode_Check(*pleft)) {
7391 Py_DECREF(*pleft);
7392 *pleft = NULL;
7393 return;
7394 }
7395 new = PyUnicode_Concat(*pleft, right);
7396 Py_DECREF(*pleft);
7397 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007398}
7399
7400void
7401PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7402{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007403 PyUnicode_Append(pleft, right);
7404 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007405}
7406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007407PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007409\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007410Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007411string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007412interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413
7414static PyObject *
7415unicode_count(PyUnicodeObject *self, PyObject *args)
7416{
7417 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007418 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007419 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 PyObject *result;
7421
Guido van Rossumb8872e62000-05-09 14:14:27 +00007422 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007423 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007424 return NULL;
7425
7426 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007427 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007428 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007430
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007431 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007432 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007433 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007434 substring->str, substring->length,
7435 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007436 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007437
7438 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007439
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 return result;
7441}
7442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007443PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007444 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007446Encode S using the codec registered for encoding. Default encoding\n\
7447is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007448handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007449a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7450'xmlcharrefreplace' as well as any other name registered with\n\
7451codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452
7453static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007454unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007456 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 char *encoding = NULL;
7458 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007459
Benjamin Peterson308d6372009-09-18 21:42:35 +00007460 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7461 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007463 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007464}
7465
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007466PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468\n\
7469Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007470If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471
7472static PyObject*
7473unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7474{
7475 Py_UNICODE *e;
7476 Py_UNICODE *p;
7477 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007478 Py_UNICODE *qe;
7479 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 PyUnicodeObject *u;
7481 int tabsize = 8;
7482
7483 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007485
Thomas Wouters7e474022000-07-16 12:04:32 +00007486 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007487 i = 0; /* chars up to and including most recent \n or \r */
7488 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7489 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 for (p = self->str; p < e; p++)
7491 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 if (tabsize > 0) {
7493 incr = tabsize - (j % tabsize); /* cannot overflow */
7494 if (j > PY_SSIZE_T_MAX - incr)
7495 goto overflow1;
7496 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007497 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 if (j > PY_SSIZE_T_MAX - 1)
7501 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007502 j++;
7503 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 if (i > PY_SSIZE_T_MAX - j)
7505 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007507 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 }
7509 }
7510
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007511 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007513
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514 /* Second pass: create output string and fill it */
7515 u = _PyUnicode_New(i + j);
7516 if (!u)
7517 return NULL;
7518
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007519 j = 0; /* same as in first pass */
7520 q = u->str; /* next output char */
7521 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007522
7523 for (p = self->str; p < e; p++)
7524 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 if (tabsize > 0) {
7526 i = tabsize - (j % tabsize);
7527 j += i;
7528 while (i--) {
7529 if (q >= qe)
7530 goto overflow2;
7531 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007532 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007534 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 else {
7536 if (q >= qe)
7537 goto overflow2;
7538 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007539 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540 if (*p == '\n' || *p == '\r')
7541 j = 0;
7542 }
7543
7544 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007545
7546 overflow2:
7547 Py_DECREF(u);
7548 overflow1:
7549 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7550 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551}
7552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007553PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007554 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007555\n\
7556Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007557such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558arguments start and end are interpreted as in slice notation.\n\
7559\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007560Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561
7562static PyObject *
7563unicode_find(PyUnicodeObject *self, PyObject *args)
7564{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007565 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007566 Py_ssize_t start;
7567 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007568 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569
Christian Heimes9cd17752007-11-18 19:35:23 +00007570 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572
Thomas Wouters477c8d52006-05-27 19:21:47 +00007573 result = stringlib_find_slice(
7574 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7575 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7576 start, end
7577 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578
7579 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007580
Christian Heimes217cfd12007-12-02 14:31:20 +00007581 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582}
7583
7584static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007585unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586{
7587 if (index < 0 || index >= self->length) {
7588 PyErr_SetString(PyExc_IndexError, "string index out of range");
7589 return NULL;
7590 }
7591
7592 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7593}
7594
Guido van Rossumc2504932007-09-18 19:42:40 +00007595/* Believe it or not, this produces the same value for ASCII strings
7596 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007597static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007598unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599{
Guido van Rossumc2504932007-09-18 19:42:40 +00007600 Py_ssize_t len;
7601 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007602 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007603
7604 if (self->hash != -1)
7605 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007606 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007607 p = self->str;
7608 x = *p << 7;
7609 while (--len >= 0)
7610 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007611 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007612 if (x == -1)
7613 x = -2;
7614 self->hash = x;
7615 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007616}
7617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007618PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007621Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622
7623static PyObject *
7624unicode_index(PyUnicodeObject *self, PyObject *args)
7625{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007626 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007627 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007628 Py_ssize_t start;
7629 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007630
Christian Heimes9cd17752007-11-18 19:35:23 +00007631 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007633
Thomas Wouters477c8d52006-05-27 19:21:47 +00007634 result = stringlib_find_slice(
7635 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7636 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7637 start, end
7638 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639
7640 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007641
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642 if (result < 0) {
7643 PyErr_SetString(PyExc_ValueError, "substring not found");
7644 return NULL;
7645 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007646
Christian Heimes217cfd12007-12-02 14:31:20 +00007647 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648}
7649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007650PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007653Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007654at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655
7656static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007657unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658{
7659 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7660 register const Py_UNICODE *e;
7661 int cased;
7662
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663 /* Shortcut for single character strings */
7664 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007667 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007668 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007670
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671 e = p + PyUnicode_GET_SIZE(self);
7672 cased = 0;
7673 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007675
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7677 return PyBool_FromLong(0);
7678 else if (!cased && Py_UNICODE_ISLOWER(ch))
7679 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007681 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682}
7683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007684PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007687Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007688at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689
7690static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007691unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692{
7693 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7694 register const Py_UNICODE *e;
7695 int cased;
7696
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697 /* Shortcut for single character strings */
7698 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007701 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007702 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007703 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007704
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705 e = p + PyUnicode_GET_SIZE(self);
7706 cased = 0;
7707 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007709
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7711 return PyBool_FromLong(0);
7712 else if (!cased && Py_UNICODE_ISUPPER(ch))
7713 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007715 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716}
7717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007718PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007721Return True if S is a titlecased string and there is at least one\n\
7722character in S, i.e. upper- and titlecase characters may only\n\
7723follow uncased characters and lowercase characters only cased ones.\n\
7724Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725
7726static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007727unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728{
7729 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7730 register const Py_UNICODE *e;
7731 int cased, previous_is_cased;
7732
Guido van Rossumd57fd912000-03-10 22:53:23 +00007733 /* Shortcut for single character strings */
7734 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7736 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007738 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007739 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007741
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 e = p + PyUnicode_GET_SIZE(self);
7743 cased = 0;
7744 previous_is_cased = 0;
7745 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007746 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007747
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7749 if (previous_is_cased)
7750 return PyBool_FromLong(0);
7751 previous_is_cased = 1;
7752 cased = 1;
7753 }
7754 else if (Py_UNICODE_ISLOWER(ch)) {
7755 if (!previous_is_cased)
7756 return PyBool_FromLong(0);
7757 previous_is_cased = 1;
7758 cased = 1;
7759 }
7760 else
7761 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007763 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764}
7765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007766PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007769Return True if all characters in S are whitespace\n\
7770and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771
7772static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007773unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774{
7775 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7776 register const Py_UNICODE *e;
7777
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778 /* Shortcut for single character strings */
7779 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 Py_UNICODE_ISSPACE(*p))
7781 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007783 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007784 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007786
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 e = p + PyUnicode_GET_SIZE(self);
7788 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 if (!Py_UNICODE_ISSPACE(*p))
7790 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007792 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793}
7794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007795PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007797\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007798Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007799and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007800
7801static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007802unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007803{
7804 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7805 register const Py_UNICODE *e;
7806
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007807 /* Shortcut for single character strings */
7808 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 Py_UNICODE_ISALPHA(*p))
7810 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007811
7812 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007813 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007815
7816 e = p + PyUnicode_GET_SIZE(self);
7817 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 if (!Py_UNICODE_ISALPHA(*p))
7819 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007820 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007821 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007822}
7823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007824PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007826\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007827Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007828and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007829
7830static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007831unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007832{
7833 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7834 register const Py_UNICODE *e;
7835
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007836 /* Shortcut for single character strings */
7837 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 Py_UNICODE_ISALNUM(*p))
7839 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007840
7841 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007842 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007844
7845 e = p + PyUnicode_GET_SIZE(self);
7846 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 if (!Py_UNICODE_ISALNUM(*p))
7848 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007849 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007850 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007851}
7852
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007853PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007856Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007857False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858
7859static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007860unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861{
7862 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7863 register const Py_UNICODE *e;
7864
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865 /* Shortcut for single character strings */
7866 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 Py_UNICODE_ISDECIMAL(*p))
7868 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007870 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007871 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007873
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 e = p + PyUnicode_GET_SIZE(self);
7875 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 if (!Py_UNICODE_ISDECIMAL(*p))
7877 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007879 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880}
7881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007882PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007885Return True if all characters in S are digits\n\
7886and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887
7888static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007889unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890{
7891 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7892 register const Py_UNICODE *e;
7893
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894 /* Shortcut for single character strings */
7895 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 Py_UNICODE_ISDIGIT(*p))
7897 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007899 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007900 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007902
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903 e = p + PyUnicode_GET_SIZE(self);
7904 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 if (!Py_UNICODE_ISDIGIT(*p))
7906 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007908 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909}
7910
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007911PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007914Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007915False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916
7917static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007918unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919{
7920 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7921 register const Py_UNICODE *e;
7922
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923 /* Shortcut for single character strings */
7924 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 Py_UNICODE_ISNUMERIC(*p))
7926 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007928 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007929 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007931
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932 e = p + PyUnicode_GET_SIZE(self);
7933 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 if (!Py_UNICODE_ISNUMERIC(*p))
7935 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007937 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007938}
7939
Martin v. Löwis47383402007-08-15 07:32:56 +00007940int
7941PyUnicode_IsIdentifier(PyObject *self)
7942{
7943 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7944 register const Py_UNICODE *e;
7945
7946 /* Special case for empty strings */
7947 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007949
7950 /* PEP 3131 says that the first character must be in
7951 XID_Start and subsequent characters in XID_Continue,
7952 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007953 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007954 letters, digits, underscore). However, given the current
7955 definition of XID_Start and XID_Continue, it is sufficient
7956 to check just for these, except that _ must be allowed
7957 as starting an identifier. */
7958 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7959 return 0;
7960
7961 e = p + PyUnicode_GET_SIZE(self);
7962 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 if (!_PyUnicode_IsXidContinue(*p))
7964 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007965 }
7966 return 1;
7967}
7968
7969PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007970 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007971\n\
7972Return True if S is a valid identifier according\n\
7973to the language definition.");
7974
7975static PyObject*
7976unicode_isidentifier(PyObject *self)
7977{
7978 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7979}
7980
Georg Brandl559e5d72008-06-11 18:37:52 +00007981PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00007983\n\
7984Return True if all characters in S are considered\n\
7985printable in repr() or S is empty, False otherwise.");
7986
7987static PyObject*
7988unicode_isprintable(PyObject *self)
7989{
7990 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7991 register const Py_UNICODE *e;
7992
7993 /* Shortcut for single character strings */
7994 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7995 Py_RETURN_TRUE;
7996 }
7997
7998 e = p + PyUnicode_GET_SIZE(self);
7999 for (; p < e; p++) {
8000 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8001 Py_RETURN_FALSE;
8002 }
8003 }
8004 Py_RETURN_TRUE;
8005}
8006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008007PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008008 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009\n\
8010Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008011iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012
8013static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008014unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008016 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017}
8018
Martin v. Löwis18e16552006-02-15 17:27:45 +00008019static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020unicode_length(PyUnicodeObject *self)
8021{
8022 return self->length;
8023}
8024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008025PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008028Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008029done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030
8031static PyObject *
8032unicode_ljust(PyUnicodeObject *self, PyObject *args)
8033{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008034 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008035 Py_UNICODE fillchar = ' ';
8036
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008037 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038 return NULL;
8039
Tim Peters7a29bd52001-09-12 03:03:31 +00008040 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008041 Py_INCREF(self);
8042 return (PyObject*) self;
8043 }
8044
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008045 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046}
8047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008048PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008051Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052
8053static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008054unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 return fixup(self, fixlower);
8057}
8058
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8067
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008068/* externally visible for str.strip(unicode) */
8069PyObject *
8070_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8071{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008072 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8073 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8074 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8075 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8076 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008077
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008079
Benjamin Peterson14339b62009-01-31 16:36:08 +00008080 i = 0;
8081 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8083 i++;
8084 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008086
Benjamin Peterson14339b62009-01-31 16:36:08 +00008087 j = len;
8088 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 do {
8090 j--;
8091 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8092 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008093 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008094
Benjamin Peterson14339b62009-01-31 16:36:08 +00008095 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 Py_INCREF(self);
8097 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008098 }
8099 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008100 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008101}
8102
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103
8104static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008105do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008107 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8108 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008109
Benjamin Peterson14339b62009-01-31 16:36:08 +00008110 i = 0;
8111 if (striptype != RIGHTSTRIP) {
8112 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8113 i++;
8114 }
8115 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008116
Benjamin Peterson14339b62009-01-31 16:36:08 +00008117 j = len;
8118 if (striptype != LEFTSTRIP) {
8119 do {
8120 j--;
8121 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8122 j++;
8123 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008124
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8126 Py_INCREF(self);
8127 return (PyObject*)self;
8128 }
8129 else
8130 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131}
8132
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008133
8134static PyObject *
8135do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8136{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008137 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008138
Benjamin Peterson14339b62009-01-31 16:36:08 +00008139 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8140 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008141
Benjamin Peterson14339b62009-01-31 16:36:08 +00008142 if (sep != NULL && sep != Py_None) {
8143 if (PyUnicode_Check(sep))
8144 return _PyUnicode_XStrip(self, striptype, sep);
8145 else {
8146 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 "%s arg must be None or str",
8148 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 return NULL;
8150 }
8151 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008152
Benjamin Peterson14339b62009-01-31 16:36:08 +00008153 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008154}
8155
8156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008157PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008159\n\
8160Return a copy of the string S with leading and trailing\n\
8161whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008162If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008163
8164static PyObject *
8165unicode_strip(PyUnicodeObject *self, PyObject *args)
8166{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008167 if (PyTuple_GET_SIZE(args) == 0)
8168 return do_strip(self, BOTHSTRIP); /* Common case */
8169 else
8170 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008171}
8172
8173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008174PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008176\n\
8177Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008178If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008179
8180static PyObject *
8181unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8182{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 if (PyTuple_GET_SIZE(args) == 0)
8184 return do_strip(self, LEFTSTRIP); /* Common case */
8185 else
8186 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008187}
8188
8189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008190PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008192\n\
8193Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008194If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008195
8196static PyObject *
8197unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8198{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008199 if (PyTuple_GET_SIZE(args) == 0)
8200 return do_strip(self, RIGHTSTRIP); /* Common case */
8201 else
8202 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008203}
8204
8205
Guido van Rossumd57fd912000-03-10 22:53:23 +00008206static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008207unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008208{
8209 PyUnicodeObject *u;
8210 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008211 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008212 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213
Georg Brandl222de0f2009-04-12 12:01:50 +00008214 if (len < 1) {
8215 Py_INCREF(unicode_empty);
8216 return (PyObject *)unicode_empty;
8217 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008218
Tim Peters7a29bd52001-09-12 03:03:31 +00008219 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008220 /* no repeat, return original string */
8221 Py_INCREF(str);
8222 return (PyObject*) str;
8223 }
Tim Peters8f422462000-09-09 06:13:41 +00008224
8225 /* ensure # of chars needed doesn't overflow int and # of bytes
8226 * needed doesn't overflow size_t
8227 */
8228 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008229 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008230 PyErr_SetString(PyExc_OverflowError,
8231 "repeated string is too long");
8232 return NULL;
8233 }
8234 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8235 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8236 PyErr_SetString(PyExc_OverflowError,
8237 "repeated string is too long");
8238 return NULL;
8239 }
8240 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241 if (!u)
8242 return NULL;
8243
8244 p = u->str;
8245
Georg Brandl222de0f2009-04-12 12:01:50 +00008246 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008247 Py_UNICODE_FILL(p, str->str[0], len);
8248 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008249 Py_ssize_t done = str->length; /* number of characters copied this far */
8250 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008252 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008253 Py_UNICODE_COPY(p+done, p, n);
8254 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 }
8257
8258 return (PyObject*) u;
8259}
8260
8261PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 PyObject *subobj,
8263 PyObject *replobj,
8264 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265{
8266 PyObject *self;
8267 PyObject *str1;
8268 PyObject *str2;
8269 PyObject *result;
8270
8271 self = PyUnicode_FromObject(obj);
8272 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274 str1 = PyUnicode_FromObject(subobj);
8275 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 Py_DECREF(self);
8277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 }
8279 str2 = PyUnicode_FromObject(replobj);
8280 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 Py_DECREF(self);
8282 Py_DECREF(str1);
8283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008284 }
Tim Petersced69f82003-09-16 20:30:58 +00008285 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 (PyUnicodeObject *)str1,
8287 (PyUnicodeObject *)str2,
8288 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 Py_DECREF(self);
8290 Py_DECREF(str1);
8291 Py_DECREF(str2);
8292 return result;
8293}
8294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008295PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008296 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297\n\
8298Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008299old replaced by new. If the optional argument count is\n\
8300given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
8302static PyObject*
8303unicode_replace(PyUnicodeObject *self, PyObject *args)
8304{
8305 PyUnicodeObject *str1;
8306 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008307 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 PyObject *result;
8309
Martin v. Löwis18e16552006-02-15 17:27:45 +00008310 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 return NULL;
8312 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8313 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008316 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 Py_DECREF(str1);
8318 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320
8321 result = replace(self, str1, str2, maxcount);
8322
8323 Py_DECREF(str1);
8324 Py_DECREF(str2);
8325 return result;
8326}
8327
8328static
8329PyObject *unicode_repr(PyObject *unicode)
8330{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008331 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008332 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008333 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8334 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8335
8336 /* XXX(nnorwitz): rather than over-allocating, it would be
8337 better to choose a different scheme. Perhaps scan the
8338 first N-chars of the string and allocate based on that size.
8339 */
8340 /* Initial allocation is based on the longest-possible unichr
8341 escape.
8342
8343 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8344 unichr, so in this case it's the longest unichr escape. In
8345 narrow (UTF-16) builds this is five chars per source unichr
8346 since there are two unichrs in the surrogate pair, so in narrow
8347 (UTF-16) builds it's not the longest unichr escape.
8348
8349 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8350 so in the narrow (UTF-16) build case it's the longest unichr
8351 escape.
8352 */
8353
Walter Dörwald1ab83302007-05-18 17:15:44 +00008354 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008355 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008356#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008358#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008360#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008362 if (repr == NULL)
8363 return NULL;
8364
Walter Dörwald1ab83302007-05-18 17:15:44 +00008365 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008366
8367 /* Add quote */
8368 *p++ = (findchar(s, size, '\'') &&
8369 !findchar(s, size, '"')) ? '"' : '\'';
8370 while (size-- > 0) {
8371 Py_UNICODE ch = *s++;
8372
8373 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008374 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008375 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008376 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008377 continue;
8378 }
8379
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008381 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008382 *p++ = '\\';
8383 *p++ = 't';
8384 }
8385 else if (ch == '\n') {
8386 *p++ = '\\';
8387 *p++ = 'n';
8388 }
8389 else if (ch == '\r') {
8390 *p++ = '\\';
8391 *p++ = 'r';
8392 }
8393
8394 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008395 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008396 *p++ = '\\';
8397 *p++ = 'x';
8398 *p++ = hexdigits[(ch >> 4) & 0x000F];
8399 *p++ = hexdigits[ch & 0x000F];
8400 }
8401
Georg Brandl559e5d72008-06-11 18:37:52 +00008402 /* Copy ASCII characters as-is */
8403 else if (ch < 0x7F) {
8404 *p++ = ch;
8405 }
8406
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008408 else {
8409 Py_UCS4 ucs = ch;
8410
8411#ifndef Py_UNICODE_WIDE
8412 Py_UNICODE ch2 = 0;
8413 /* Get code point from surrogate pair */
8414 if (size > 0) {
8415 ch2 = *s;
8416 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008418 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008420 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008421 size--;
8422 }
8423 }
8424#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008425 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008426 (categories Z* and C* except ASCII space)
8427 */
8428 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8429 /* Map 8-bit characters to '\xhh' */
8430 if (ucs <= 0xff) {
8431 *p++ = '\\';
8432 *p++ = 'x';
8433 *p++ = hexdigits[(ch >> 4) & 0x000F];
8434 *p++ = hexdigits[ch & 0x000F];
8435 }
8436 /* Map 21-bit characters to '\U00xxxxxx' */
8437 else if (ucs >= 0x10000) {
8438 *p++ = '\\';
8439 *p++ = 'U';
8440 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8441 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8442 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8443 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8444 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8445 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8446 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8447 *p++ = hexdigits[ucs & 0x0000000F];
8448 }
8449 /* Map 16-bit characters to '\uxxxx' */
8450 else {
8451 *p++ = '\\';
8452 *p++ = 'u';
8453 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8454 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8455 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8456 *p++ = hexdigits[ucs & 0x000F];
8457 }
8458 }
8459 /* Copy characters as-is */
8460 else {
8461 *p++ = ch;
8462#ifndef Py_UNICODE_WIDE
8463 if (ucs >= 0x10000)
8464 *p++ = ch2;
8465#endif
8466 }
8467 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008468 }
8469 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008470 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008471
8472 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008473 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008474 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475}
8476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008477PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479\n\
8480Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008481such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008482arguments start and end are interpreted as in slice notation.\n\
8483\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008484Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008485
8486static PyObject *
8487unicode_rfind(PyUnicodeObject *self, PyObject *args)
8488{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008489 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008490 Py_ssize_t start;
8491 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008492 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008493
Christian Heimes9cd17752007-11-18 19:35:23 +00008494 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008496
Thomas Wouters477c8d52006-05-27 19:21:47 +00008497 result = stringlib_rfind_slice(
8498 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8499 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8500 start, end
8501 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502
8503 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008504
Christian Heimes217cfd12007-12-02 14:31:20 +00008505 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506}
8507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008508PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008510\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008511Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008512
8513static PyObject *
8514unicode_rindex(PyUnicodeObject *self, PyObject *args)
8515{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008516 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008517 Py_ssize_t start;
8518 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008519 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520
Christian Heimes9cd17752007-11-18 19:35:23 +00008521 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008522 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523
Thomas Wouters477c8d52006-05-27 19:21:47 +00008524 result = stringlib_rfind_slice(
8525 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8526 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8527 start, end
8528 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008529
8530 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008531
Guido van Rossumd57fd912000-03-10 22:53:23 +00008532 if (result < 0) {
8533 PyErr_SetString(PyExc_ValueError, "substring not found");
8534 return NULL;
8535 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008536 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537}
8538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008539PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008542Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008543done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544
8545static PyObject *
8546unicode_rjust(PyUnicodeObject *self, PyObject *args)
8547{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008548 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008549 Py_UNICODE fillchar = ' ';
8550
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008551 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008552 return NULL;
8553
Tim Peters7a29bd52001-09-12 03:03:31 +00008554 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008555 Py_INCREF(self);
8556 return (PyObject*) self;
8557 }
8558
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008559 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560}
8561
Guido van Rossumd57fd912000-03-10 22:53:23 +00008562PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 PyObject *sep,
8564 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565{
8566 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008567
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 s = PyUnicode_FromObject(s);
8569 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008570 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 if (sep != NULL) {
8572 sep = PyUnicode_FromObject(sep);
8573 if (sep == NULL) {
8574 Py_DECREF(s);
8575 return NULL;
8576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 }
8578
8579 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8580
8581 Py_DECREF(s);
8582 Py_XDECREF(sep);
8583 return result;
8584}
8585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008586PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588\n\
8589Return a list of the words in S, using sep as the\n\
8590delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008591splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008592whitespace string is a separator and empty strings are\n\
8593removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594
8595static PyObject*
8596unicode_split(PyUnicodeObject *self, PyObject *args)
8597{
8598 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008599 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600
Martin v. Löwis18e16552006-02-15 17:27:45 +00008601 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602 return NULL;
8603
8604 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610}
8611
Thomas Wouters477c8d52006-05-27 19:21:47 +00008612PyObject *
8613PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8614{
8615 PyObject* str_obj;
8616 PyObject* sep_obj;
8617 PyObject* out;
8618
8619 str_obj = PyUnicode_FromObject(str_in);
8620 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008622 sep_obj = PyUnicode_FromObject(sep_in);
8623 if (!sep_obj) {
8624 Py_DECREF(str_obj);
8625 return NULL;
8626 }
8627
8628 out = stringlib_partition(
8629 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8630 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8631 );
8632
8633 Py_DECREF(sep_obj);
8634 Py_DECREF(str_obj);
8635
8636 return out;
8637}
8638
8639
8640PyObject *
8641PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8642{
8643 PyObject* str_obj;
8644 PyObject* sep_obj;
8645 PyObject* out;
8646
8647 str_obj = PyUnicode_FromObject(str_in);
8648 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008650 sep_obj = PyUnicode_FromObject(sep_in);
8651 if (!sep_obj) {
8652 Py_DECREF(str_obj);
8653 return NULL;
8654 }
8655
8656 out = stringlib_rpartition(
8657 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8658 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8659 );
8660
8661 Py_DECREF(sep_obj);
8662 Py_DECREF(str_obj);
8663
8664 return out;
8665}
8666
8667PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008669\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008670Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008671the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008672found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008673
8674static PyObject*
8675unicode_partition(PyUnicodeObject *self, PyObject *separator)
8676{
8677 return PyUnicode_Partition((PyObject *)self, separator);
8678}
8679
8680PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008681 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008682\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008683Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008684the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008685separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008686
8687static PyObject*
8688unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8689{
8690 return PyUnicode_RPartition((PyObject *)self, separator);
8691}
8692
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008693PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 PyObject *sep,
8695 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008696{
8697 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008698
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008699 s = PyUnicode_FromObject(s);
8700 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008701 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 if (sep != NULL) {
8703 sep = PyUnicode_FromObject(sep);
8704 if (sep == NULL) {
8705 Py_DECREF(s);
8706 return NULL;
8707 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008708 }
8709
8710 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8711
8712 Py_DECREF(s);
8713 Py_XDECREF(sep);
8714 return result;
8715}
8716
8717PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008719\n\
8720Return a list of the words in S, using sep as the\n\
8721delimiter string, starting at the end of the string and\n\
8722working to the front. If maxsplit is given, at most maxsplit\n\
8723splits are done. If sep is not specified, any whitespace string\n\
8724is a separator.");
8725
8726static PyObject*
8727unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8728{
8729 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008730 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008731
Martin v. Löwis18e16552006-02-15 17:27:45 +00008732 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008733 return NULL;
8734
8735 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008737 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008739 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008741}
8742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008743PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745\n\
8746Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008747Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008748is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749
8750static PyObject*
8751unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8752{
Guido van Rossum86662912000-04-11 15:38:46 +00008753 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754
Guido van Rossum86662912000-04-11 15:38:46 +00008755 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 return NULL;
8757
Guido van Rossum86662912000-04-11 15:38:46 +00008758 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008759}
8760
8761static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008762PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763{
Walter Dörwald346737f2007-05-31 10:44:43 +00008764 if (PyUnicode_CheckExact(self)) {
8765 Py_INCREF(self);
8766 return self;
8767 } else
8768 /* Subtype -- return genuine unicode string with the same value. */
8769 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8770 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771}
8772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008773PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775\n\
8776Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008777and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778
8779static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008780unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 return fixup(self, fixswapcase);
8783}
8784
Georg Brandlceee0772007-11-27 23:48:05 +00008785PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008786 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008787\n\
8788Return a translation table usable for str.translate().\n\
8789If there is only one argument, it must be a dictionary mapping Unicode\n\
8790ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008791Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008792If there are two arguments, they must be strings of equal length, and\n\
8793in the resulting dictionary, each character in x will be mapped to the\n\
8794character at the same position in y. If there is a third argument, it\n\
8795must be a string, whose characters will be mapped to None in the result.");
8796
8797static PyObject*
8798unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8799{
8800 PyObject *x, *y = NULL, *z = NULL;
8801 PyObject *new = NULL, *key, *value;
8802 Py_ssize_t i = 0;
8803 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008804
Georg Brandlceee0772007-11-27 23:48:05 +00008805 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8806 return NULL;
8807 new = PyDict_New();
8808 if (!new)
8809 return NULL;
8810 if (y != NULL) {
8811 /* x must be a string too, of equal length */
8812 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8813 if (!PyUnicode_Check(x)) {
8814 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8815 "be a string if there is a second argument");
8816 goto err;
8817 }
8818 if (PyUnicode_GET_SIZE(x) != ylen) {
8819 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8820 "arguments must have equal length");
8821 goto err;
8822 }
8823 /* create entries for translating chars in x to those in y */
8824 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008825 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8826 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008827 if (!key || !value)
8828 goto err;
8829 res = PyDict_SetItem(new, key, value);
8830 Py_DECREF(key);
8831 Py_DECREF(value);
8832 if (res < 0)
8833 goto err;
8834 }
8835 /* create entries for deleting chars in z */
8836 if (z != NULL) {
8837 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008838 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008839 if (!key)
8840 goto err;
8841 res = PyDict_SetItem(new, key, Py_None);
8842 Py_DECREF(key);
8843 if (res < 0)
8844 goto err;
8845 }
8846 }
8847 } else {
8848 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008849 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008850 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8851 "to maketrans it must be a dict");
8852 goto err;
8853 }
8854 /* copy entries into the new dict, converting string keys to int keys */
8855 while (PyDict_Next(x, &i, &key, &value)) {
8856 if (PyUnicode_Check(key)) {
8857 /* convert string keys to integer keys */
8858 PyObject *newkey;
8859 if (PyUnicode_GET_SIZE(key) != 1) {
8860 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8861 "table must be of length 1");
8862 goto err;
8863 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008864 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008865 if (!newkey)
8866 goto err;
8867 res = PyDict_SetItem(new, newkey, value);
8868 Py_DECREF(newkey);
8869 if (res < 0)
8870 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008871 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008872 /* just keep integer keys */
8873 if (PyDict_SetItem(new, key, value) < 0)
8874 goto err;
8875 } else {
8876 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8877 "be strings or integers");
8878 goto err;
8879 }
8880 }
8881 }
8882 return new;
8883 err:
8884 Py_DECREF(new);
8885 return NULL;
8886}
8887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008888PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890\n\
8891Return a copy of the string S, where all characters have been mapped\n\
8892through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008893Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008894Unmapped characters are left untouched. Characters mapped to None\n\
8895are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896
8897static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008898unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899{
Georg Brandlceee0772007-11-27 23:48:05 +00008900 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901}
8902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008903PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008906Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907
8908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008909unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911 return fixup(self, fixupper);
8912}
8913
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008914PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008916\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008917Pad a numeric string S with zeros on the left, to fill a field\n\
8918of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919
8920static PyObject *
8921unicode_zfill(PyUnicodeObject *self, PyObject *args)
8922{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008923 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 PyUnicodeObject *u;
8925
Martin v. Löwis18e16552006-02-15 17:27:45 +00008926 Py_ssize_t width;
8927 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008928 return NULL;
8929
8930 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008931 if (PyUnicode_CheckExact(self)) {
8932 Py_INCREF(self);
8933 return (PyObject*) self;
8934 }
8935 else
8936 return PyUnicode_FromUnicode(
8937 PyUnicode_AS_UNICODE(self),
8938 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 }
8941
8942 fill = width - self->length;
8943
8944 u = pad(self, fill, 0, '0');
8945
Walter Dörwald068325e2002-04-15 13:36:47 +00008946 if (u == NULL)
8947 return NULL;
8948
Guido van Rossumd57fd912000-03-10 22:53:23 +00008949 if (u->str[fill] == '+' || u->str[fill] == '-') {
8950 /* move sign to beginning of string */
8951 u->str[0] = u->str[fill];
8952 u->str[fill] = '0';
8953 }
8954
8955 return (PyObject*) u;
8956}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008957
#if 0
/* Compiled out.  Reports the file-level numfree counter as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}

/* Compiled out.  Runs PyUnicode_TransformDecimalToASCII() over self's
   character buffer and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
8972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008973PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008976Return True if S starts with the specified prefix, False otherwise.\n\
8977With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008978With optional end, stop comparing S at that position.\n\
8979prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980
8981static PyObject *
8982unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008985 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008987 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008988 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008989 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008991 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8993 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008994 if (PyTuple_Check(subobj)) {
8995 Py_ssize_t i;
8996 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8997 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008999 if (substring == NULL)
9000 return NULL;
9001 result = tailmatch(self, substring, start, end, -1);
9002 Py_DECREF(substring);
9003 if (result) {
9004 Py_RETURN_TRUE;
9005 }
9006 }
9007 /* nothing matched */
9008 Py_RETURN_FALSE;
9009 }
9010 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009013 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009015 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016}
9017
9018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009019PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009022Return True if S ends with the specified suffix, False otherwise.\n\
9023With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009024With optional end, stop comparing S at that position.\n\
9025suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026
9027static PyObject *
9028unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009031 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009033 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009034 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009035 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009037 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009038 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9039 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009040 if (PyTuple_Check(subobj)) {
9041 Py_ssize_t i;
9042 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9043 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009045 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009046 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009047 result = tailmatch(self, substring, start, end, +1);
9048 Py_DECREF(substring);
9049 if (result) {
9050 Py_RETURN_TRUE;
9051 }
9052 }
9053 Py_RETURN_FALSE;
9054 }
9055 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009059 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009061 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062}
9063
Eric Smith8c663262007-08-25 02:26:07 +00009064#include "stringlib/string_format.h"
9065
9066PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009067 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009068\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009069Return a formatted version of S, using substitutions from args and kwargs.\n\
9070The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009071
Eric Smith27bbca62010-11-04 17:06:58 +00009072PyDoc_STRVAR(format_map__doc__,
9073 "S.format_map(mapping) -> str\n\
9074\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009075Return a formatted version of S, using substitutions from mapping.\n\
9076The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009077
Eric Smith4a7d76d2008-05-30 18:10:19 +00009078static PyObject *
9079unicode__format__(PyObject* self, PyObject* args)
9080{
9081 PyObject *format_spec;
9082
9083 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9084 return NULL;
9085
9086 return _PyUnicode_FormatAdvanced(self,
9087 PyUnicode_AS_UNICODE(format_spec),
9088 PyUnicode_GET_SIZE(format_spec));
9089}
9090
Eric Smith8c663262007-08-25 02:26:07 +00009091PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009093\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009094Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009095
9096static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009097unicode__sizeof__(PyUnicodeObject *v)
9098{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009099 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9100 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009101}
9102
9103PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009104 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009105
9106static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009107unicode_getnewargs(PyUnicodeObject *v)
9108{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009109 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009110}
9111
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112static PyMethodDef unicode_methods[] = {
9113
9114 /* Order is according to common usage: often used methods should
9115 appear first, since lookup is done sequentially. */
9116
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009117 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009118 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9119 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009120 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009121 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9122 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9123 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9124 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9125 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9126 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9127 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009128 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009129 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9130 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9131 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009132 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009133 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9134 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9135 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009136 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009137 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009138 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009139 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009140 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9141 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9142 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9143 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9144 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9145 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9146 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9147 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9148 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9149 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9150 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9151 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9152 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9153 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009154 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009155 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009156 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009157 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009158 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009159 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009160 {"maketrans", (PyCFunction) unicode_maketrans,
9161 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009162 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009163#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009164 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165#endif
9166
9167#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009168 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009169 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009170 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171#endif
9172
Benjamin Peterson14339b62009-01-31 16:36:08 +00009173 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174 {NULL, NULL}
9175};
9176
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009177static PyObject *
9178unicode_mod(PyObject *v, PyObject *w)
9179{
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 if (!PyUnicode_Check(v)) {
9181 Py_INCREF(Py_NotImplemented);
9182 return Py_NotImplemented;
9183 }
9184 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009185}
9186
9187static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009188 0, /*nb_add*/
9189 0, /*nb_subtract*/
9190 0, /*nb_multiply*/
9191 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009192};
9193
Guido van Rossumd57fd912000-03-10 22:53:23 +00009194static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009195 (lenfunc) unicode_length, /* sq_length */
9196 PyUnicode_Concat, /* sq_concat */
9197 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9198 (ssizeargfunc) unicode_getitem, /* sq_item */
9199 0, /* sq_slice */
9200 0, /* sq_ass_item */
9201 0, /* sq_ass_slice */
9202 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009203};
9204
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009205static PyObject*
9206unicode_subscript(PyUnicodeObject* self, PyObject* item)
9207{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009208 if (PyIndex_Check(item)) {
9209 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009210 if (i == -1 && PyErr_Occurred())
9211 return NULL;
9212 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009213 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009214 return unicode_getitem(self, i);
9215 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009216 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009217 Py_UNICODE* source_buf;
9218 Py_UNICODE* result_buf;
9219 PyObject* result;
9220
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009221 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009222 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009223 return NULL;
9224 }
9225
9226 if (slicelength <= 0) {
9227 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009228 } else if (start == 0 && step == 1 && slicelength == self->length &&
9229 PyUnicode_CheckExact(self)) {
9230 Py_INCREF(self);
9231 return (PyObject *)self;
9232 } else if (step == 1) {
9233 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009234 } else {
9235 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009236 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9237 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009238
Benjamin Peterson29060642009-01-31 22:14:21 +00009239 if (result_buf == NULL)
9240 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009241
9242 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9243 result_buf[i] = source_buf[cur];
9244 }
Tim Petersced69f82003-09-16 20:30:58 +00009245
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009246 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009247 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009248 return result;
9249 }
9250 } else {
9251 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9252 return NULL;
9253 }
9254}
9255
9256static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009257 (lenfunc)unicode_length, /* mp_length */
9258 (binaryfunc)unicode_subscript, /* mp_subscript */
9259 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009260};
9261
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262
Guido van Rossumd57fd912000-03-10 22:53:23 +00009263/* Helpers for PyUnicode_Format() */
9264
9265static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009266getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009267{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009268 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009270 (*p_argidx)++;
9271 if (arglen < 0)
9272 return args;
9273 else
9274 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275 }
9276 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009277 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278 return NULL;
9279}
9280
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009281/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009283static PyObject *
9284formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009286 char *p;
9287 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009289
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290 x = PyFloat_AsDouble(v);
9291 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009292 return NULL;
9293
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009295 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009296
Eric Smith0923d1d2009-04-16 20:16:10 +00009297 p = PyOS_double_to_string(x, type, prec,
9298 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009299 if (p == NULL)
9300 return NULL;
9301 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009302 PyMem_Free(p);
9303 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304}
9305
Tim Peters38fd5b62000-09-21 05:43:11 +00009306static PyObject*
9307formatlong(PyObject *val, int flags, int prec, int type)
9308{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009309 char *buf;
9310 int len;
9311 PyObject *str; /* temporary string object. */
9312 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009313
Benjamin Peterson14339b62009-01-31 16:36:08 +00009314 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9315 if (!str)
9316 return NULL;
9317 result = PyUnicode_FromStringAndSize(buf, len);
9318 Py_DECREF(str);
9319 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009320}
9321
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322static int
9323formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009324 size_t buflen,
9325 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009327 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009328 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009329 if (PyUnicode_GET_SIZE(v) == 1) {
9330 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9331 buf[1] = '\0';
9332 return 1;
9333 }
9334#ifndef Py_UNICODE_WIDE
9335 if (PyUnicode_GET_SIZE(v) == 2) {
9336 /* Decode a valid surrogate pair */
9337 int c0 = PyUnicode_AS_UNICODE(v)[0];
9338 int c1 = PyUnicode_AS_UNICODE(v)[1];
9339 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9340 0xDC00 <= c1 && c1 <= 0xDFFF) {
9341 buf[0] = c0;
9342 buf[1] = c1;
9343 buf[2] = '\0';
9344 return 2;
9345 }
9346 }
9347#endif
9348 goto onError;
9349 }
9350 else {
9351 /* Integer input truncated to a character */
9352 long x;
9353 x = PyLong_AsLong(v);
9354 if (x == -1 && PyErr_Occurred())
9355 goto onError;
9356
9357 if (x < 0 || x > 0x10ffff) {
9358 PyErr_SetString(PyExc_OverflowError,
9359 "%c arg not in range(0x110000)");
9360 return -1;
9361 }
9362
9363#ifndef Py_UNICODE_WIDE
9364 if (x > 0xffff) {
9365 x -= 0x10000;
9366 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9367 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9368 return 2;
9369 }
9370#endif
9371 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009372 buf[1] = '\0';
9373 return 1;
9374 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009375
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009377 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009378 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009379 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380}
9381
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009382/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009383 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009384*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009385#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009386
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389{
9390 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009391 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 int args_owned = 0;
9393 PyUnicodeObject *result = NULL;
9394 PyObject *dict = NULL;
9395 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009396
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009398 PyErr_BadInternalCall();
9399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400 }
9401 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009402 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404 fmt = PyUnicode_AS_UNICODE(uformat);
9405 fmtcnt = PyUnicode_GET_SIZE(uformat);
9406
9407 reslen = rescnt = fmtcnt + 100;
9408 result = _PyUnicode_New(reslen);
9409 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009411 res = PyUnicode_AS_UNICODE(result);
9412
9413 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 arglen = PyTuple_Size(args);
9415 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416 }
9417 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009418 arglen = -1;
9419 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009421 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009422 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009423 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424
9425 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009426 if (*fmt != '%') {
9427 if (--rescnt < 0) {
9428 rescnt = fmtcnt + 100;
9429 reslen += rescnt;
9430 if (_PyUnicode_Resize(&result, reslen) < 0)
9431 goto onError;
9432 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9433 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009434 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009435 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009436 }
9437 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009438 /* Got a format specifier */
9439 int flags = 0;
9440 Py_ssize_t width = -1;
9441 int prec = -1;
9442 Py_UNICODE c = '\0';
9443 Py_UNICODE fill;
9444 int isnumok;
9445 PyObject *v = NULL;
9446 PyObject *temp = NULL;
9447 Py_UNICODE *pbuf;
9448 Py_UNICODE sign;
9449 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009450 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451
Benjamin Peterson29060642009-01-31 22:14:21 +00009452 fmt++;
9453 if (*fmt == '(') {
9454 Py_UNICODE *keystart;
9455 Py_ssize_t keylen;
9456 PyObject *key;
9457 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009458
Benjamin Peterson29060642009-01-31 22:14:21 +00009459 if (dict == NULL) {
9460 PyErr_SetString(PyExc_TypeError,
9461 "format requires a mapping");
9462 goto onError;
9463 }
9464 ++fmt;
9465 --fmtcnt;
9466 keystart = fmt;
9467 /* Skip over balanced parentheses */
9468 while (pcount > 0 && --fmtcnt >= 0) {
9469 if (*fmt == ')')
9470 --pcount;
9471 else if (*fmt == '(')
9472 ++pcount;
9473 fmt++;
9474 }
9475 keylen = fmt - keystart - 1;
9476 if (fmtcnt < 0 || pcount > 0) {
9477 PyErr_SetString(PyExc_ValueError,
9478 "incomplete format key");
9479 goto onError;
9480 }
9481#if 0
9482 /* keys are converted to strings using UTF-8 and
9483 then looked up since Python uses strings to hold
9484 variables names etc. in its namespaces and we
9485 wouldn't want to break common idioms. */
9486 key = PyUnicode_EncodeUTF8(keystart,
9487 keylen,
9488 NULL);
9489#else
9490 key = PyUnicode_FromUnicode(keystart, keylen);
9491#endif
9492 if (key == NULL)
9493 goto onError;
9494 if (args_owned) {
9495 Py_DECREF(args);
9496 args_owned = 0;
9497 }
9498 args = PyObject_GetItem(dict, key);
9499 Py_DECREF(key);
9500 if (args == NULL) {
9501 goto onError;
9502 }
9503 args_owned = 1;
9504 arglen = -1;
9505 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009506 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009507 while (--fmtcnt >= 0) {
9508 switch (c = *fmt++) {
9509 case '-': flags |= F_LJUST; continue;
9510 case '+': flags |= F_SIGN; continue;
9511 case ' ': flags |= F_BLANK; continue;
9512 case '#': flags |= F_ALT; continue;
9513 case '0': flags |= F_ZERO; continue;
9514 }
9515 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009516 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 if (c == '*') {
9518 v = getnextarg(args, arglen, &argidx);
9519 if (v == NULL)
9520 goto onError;
9521 if (!PyLong_Check(v)) {
9522 PyErr_SetString(PyExc_TypeError,
9523 "* wants int");
9524 goto onError;
9525 }
9526 width = PyLong_AsLong(v);
9527 if (width == -1 && PyErr_Occurred())
9528 goto onError;
9529 if (width < 0) {
9530 flags |= F_LJUST;
9531 width = -width;
9532 }
9533 if (--fmtcnt >= 0)
9534 c = *fmt++;
9535 }
9536 else if (c >= '0' && c <= '9') {
9537 width = c - '0';
9538 while (--fmtcnt >= 0) {
9539 c = *fmt++;
9540 if (c < '0' || c > '9')
9541 break;
9542 if ((width*10) / 10 != width) {
9543 PyErr_SetString(PyExc_ValueError,
9544 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009545 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 }
9547 width = width*10 + (c - '0');
9548 }
9549 }
9550 if (c == '.') {
9551 prec = 0;
9552 if (--fmtcnt >= 0)
9553 c = *fmt++;
9554 if (c == '*') {
9555 v = getnextarg(args, arglen, &argidx);
9556 if (v == NULL)
9557 goto onError;
9558 if (!PyLong_Check(v)) {
9559 PyErr_SetString(PyExc_TypeError,
9560 "* wants int");
9561 goto onError;
9562 }
9563 prec = PyLong_AsLong(v);
9564 if (prec == -1 && PyErr_Occurred())
9565 goto onError;
9566 if (prec < 0)
9567 prec = 0;
9568 if (--fmtcnt >= 0)
9569 c = *fmt++;
9570 }
9571 else if (c >= '0' && c <= '9') {
9572 prec = c - '0';
9573 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009574 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009575 if (c < '0' || c > '9')
9576 break;
9577 if ((prec*10) / 10 != prec) {
9578 PyErr_SetString(PyExc_ValueError,
9579 "prec too big");
9580 goto onError;
9581 }
9582 prec = prec*10 + (c - '0');
9583 }
9584 }
9585 } /* prec */
9586 if (fmtcnt >= 0) {
9587 if (c == 'h' || c == 'l' || c == 'L') {
9588 if (--fmtcnt >= 0)
9589 c = *fmt++;
9590 }
9591 }
9592 if (fmtcnt < 0) {
9593 PyErr_SetString(PyExc_ValueError,
9594 "incomplete format");
9595 goto onError;
9596 }
9597 if (c != '%') {
9598 v = getnextarg(args, arglen, &argidx);
9599 if (v == NULL)
9600 goto onError;
9601 }
9602 sign = 0;
9603 fill = ' ';
9604 switch (c) {
9605
9606 case '%':
9607 pbuf = formatbuf;
9608 /* presume that buffer length is at least 1 */
9609 pbuf[0] = '%';
9610 len = 1;
9611 break;
9612
9613 case 's':
9614 case 'r':
9615 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009616 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009617 temp = v;
9618 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009619 }
9620 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 if (c == 's')
9622 temp = PyObject_Str(v);
9623 else if (c == 'r')
9624 temp = PyObject_Repr(v);
9625 else
9626 temp = PyObject_ASCII(v);
9627 if (temp == NULL)
9628 goto onError;
9629 if (PyUnicode_Check(temp))
9630 /* nothing to do */;
9631 else {
9632 Py_DECREF(temp);
9633 PyErr_SetString(PyExc_TypeError,
9634 "%s argument has non-string str()");
9635 goto onError;
9636 }
9637 }
9638 pbuf = PyUnicode_AS_UNICODE(temp);
9639 len = PyUnicode_GET_SIZE(temp);
9640 if (prec >= 0 && len > prec)
9641 len = prec;
9642 break;
9643
9644 case 'i':
9645 case 'd':
9646 case 'u':
9647 case 'o':
9648 case 'x':
9649 case 'X':
9650 if (c == 'i')
9651 c = 'd';
9652 isnumok = 0;
9653 if (PyNumber_Check(v)) {
9654 PyObject *iobj=NULL;
9655
9656 if (PyLong_Check(v)) {
9657 iobj = v;
9658 Py_INCREF(iobj);
9659 }
9660 else {
9661 iobj = PyNumber_Long(v);
9662 }
9663 if (iobj!=NULL) {
9664 if (PyLong_Check(iobj)) {
9665 isnumok = 1;
9666 temp = formatlong(iobj, flags, prec, c);
9667 Py_DECREF(iobj);
9668 if (!temp)
9669 goto onError;
9670 pbuf = PyUnicode_AS_UNICODE(temp);
9671 len = PyUnicode_GET_SIZE(temp);
9672 sign = 1;
9673 }
9674 else {
9675 Py_DECREF(iobj);
9676 }
9677 }
9678 }
9679 if (!isnumok) {
9680 PyErr_Format(PyExc_TypeError,
9681 "%%%c format: a number is required, "
9682 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9683 goto onError;
9684 }
9685 if (flags & F_ZERO)
9686 fill = '0';
9687 break;
9688
9689 case 'e':
9690 case 'E':
9691 case 'f':
9692 case 'F':
9693 case 'g':
9694 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009695 temp = formatfloat(v, flags, prec, c);
9696 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009697 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009698 pbuf = PyUnicode_AS_UNICODE(temp);
9699 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 sign = 1;
9701 if (flags & F_ZERO)
9702 fill = '0';
9703 break;
9704
9705 case 'c':
9706 pbuf = formatbuf;
9707 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9708 if (len < 0)
9709 goto onError;
9710 break;
9711
9712 default:
9713 PyErr_Format(PyExc_ValueError,
9714 "unsupported format character '%c' (0x%x) "
9715 "at index %zd",
9716 (31<=c && c<=126) ? (char)c : '?',
9717 (int)c,
9718 (Py_ssize_t)(fmt - 1 -
9719 PyUnicode_AS_UNICODE(uformat)));
9720 goto onError;
9721 }
9722 if (sign) {
9723 if (*pbuf == '-' || *pbuf == '+') {
9724 sign = *pbuf++;
9725 len--;
9726 }
9727 else if (flags & F_SIGN)
9728 sign = '+';
9729 else if (flags & F_BLANK)
9730 sign = ' ';
9731 else
9732 sign = 0;
9733 }
9734 if (width < len)
9735 width = len;
9736 if (rescnt - (sign != 0) < width) {
9737 reslen -= rescnt;
9738 rescnt = width + fmtcnt + 100;
9739 reslen += rescnt;
9740 if (reslen < 0) {
9741 Py_XDECREF(temp);
9742 PyErr_NoMemory();
9743 goto onError;
9744 }
9745 if (_PyUnicode_Resize(&result, reslen) < 0) {
9746 Py_XDECREF(temp);
9747 goto onError;
9748 }
9749 res = PyUnicode_AS_UNICODE(result)
9750 + reslen - rescnt;
9751 }
9752 if (sign) {
9753 if (fill != ' ')
9754 *res++ = sign;
9755 rescnt--;
9756 if (width > len)
9757 width--;
9758 }
9759 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9760 assert(pbuf[0] == '0');
9761 assert(pbuf[1] == c);
9762 if (fill != ' ') {
9763 *res++ = *pbuf++;
9764 *res++ = *pbuf++;
9765 }
9766 rescnt -= 2;
9767 width -= 2;
9768 if (width < 0)
9769 width = 0;
9770 len -= 2;
9771 }
9772 if (width > len && !(flags & F_LJUST)) {
9773 do {
9774 --rescnt;
9775 *res++ = fill;
9776 } while (--width > len);
9777 }
9778 if (fill == ' ') {
9779 if (sign)
9780 *res++ = sign;
9781 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9782 assert(pbuf[0] == '0');
9783 assert(pbuf[1] == c);
9784 *res++ = *pbuf++;
9785 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009786 }
9787 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009788 Py_UNICODE_COPY(res, pbuf, len);
9789 res += len;
9790 rescnt -= len;
9791 while (--width >= len) {
9792 --rescnt;
9793 *res++ = ' ';
9794 }
9795 if (dict && (argidx < arglen) && c != '%') {
9796 PyErr_SetString(PyExc_TypeError,
9797 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009798 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009799 goto onError;
9800 }
9801 Py_XDECREF(temp);
9802 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803 } /* until end */
9804 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009805 PyErr_SetString(PyExc_TypeError,
9806 "not all arguments converted during string formatting");
9807 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009808 }
9809
Thomas Woutersa96affe2006-03-12 00:29:36 +00009810 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009811 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009812 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009813 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814 }
9815 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816 return (PyObject *)result;
9817
Benjamin Peterson29060642009-01-31 22:14:21 +00009818 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819 Py_XDECREF(result);
9820 Py_DECREF(uformat);
9821 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009822 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823 }
9824 return NULL;
9825}
9826
Jeremy Hylton938ace62002-07-17 16:30:39 +00009827static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009828unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9829
Tim Peters6d6c1a32001-08-02 04:15:00 +00009830static PyObject *
9831unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9832{
Benjamin Peterson29060642009-01-31 22:14:21 +00009833 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009834 static char *kwlist[] = {"object", "encoding", "errors", 0};
9835 char *encoding = NULL;
9836 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009837
Benjamin Peterson14339b62009-01-31 16:36:08 +00009838 if (type != &PyUnicode_Type)
9839 return unicode_subtype_new(type, args, kwds);
9840 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009841 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009842 return NULL;
9843 if (x == NULL)
9844 return (PyObject *)_PyUnicode_New(0);
9845 if (encoding == NULL && errors == NULL)
9846 return PyObject_Str(x);
9847 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009848 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009849}
9850
Guido van Rossume023fe02001-08-30 03:12:59 +00009851static PyObject *
9852unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9853{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009854 PyUnicodeObject *tmp, *pnew;
9855 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009856
Benjamin Peterson14339b62009-01-31 16:36:08 +00009857 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9858 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9859 if (tmp == NULL)
9860 return NULL;
9861 assert(PyUnicode_Check(tmp));
9862 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9863 if (pnew == NULL) {
9864 Py_DECREF(tmp);
9865 return NULL;
9866 }
9867 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9868 if (pnew->str == NULL) {
9869 _Py_ForgetReference((PyObject *)pnew);
9870 PyObject_Del(pnew);
9871 Py_DECREF(tmp);
9872 return PyErr_NoMemory();
9873 }
9874 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9875 pnew->length = n;
9876 pnew->hash = tmp->hash;
9877 Py_DECREF(tmp);
9878 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009879}
9880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009881PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009882 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009883\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009884Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009885encoding defaults to the current default string encoding.\n\
9886errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009887
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009888static PyObject *unicode_iter(PyObject *seq);
9889
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009891 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009892 "str", /* tp_name */
9893 sizeof(PyUnicodeObject), /* tp_size */
9894 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009896 (destructor)unicode_dealloc, /* tp_dealloc */
9897 0, /* tp_print */
9898 0, /* tp_getattr */
9899 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009900 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009901 unicode_repr, /* tp_repr */
9902 &unicode_as_number, /* tp_as_number */
9903 &unicode_as_sequence, /* tp_as_sequence */
9904 &unicode_as_mapping, /* tp_as_mapping */
9905 (hashfunc) unicode_hash, /* tp_hash*/
9906 0, /* tp_call*/
9907 (reprfunc) unicode_str, /* tp_str */
9908 PyObject_GenericGetAttr, /* tp_getattro */
9909 0, /* tp_setattro */
9910 0, /* tp_as_buffer */
9911 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009912 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009913 unicode_doc, /* tp_doc */
9914 0, /* tp_traverse */
9915 0, /* tp_clear */
9916 PyUnicode_RichCompare, /* tp_richcompare */
9917 0, /* tp_weaklistoffset */
9918 unicode_iter, /* tp_iter */
9919 0, /* tp_iternext */
9920 unicode_methods, /* tp_methods */
9921 0, /* tp_members */
9922 0, /* tp_getset */
9923 &PyBaseObject_Type, /* tp_base */
9924 0, /* tp_dict */
9925 0, /* tp_descr_get */
9926 0, /* tp_descr_set */
9927 0, /* tp_dictoffset */
9928 0, /* tp_init */
9929 0, /* tp_alloc */
9930 unicode_new, /* tp_new */
9931 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932};
9933
9934/* Initialize the Unicode implementation */
9935
Thomas Wouters78890102000-07-22 19:25:51 +00009936void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009938 int i;
9939
Thomas Wouters477c8d52006-05-27 19:21:47 +00009940 /* XXX - move this array to unicodectype.c ? */
9941 Py_UNICODE linebreak[] = {
9942 0x000A, /* LINE FEED */
9943 0x000D, /* CARRIAGE RETURN */
9944 0x001C, /* FILE SEPARATOR */
9945 0x001D, /* GROUP SEPARATOR */
9946 0x001E, /* RECORD SEPARATOR */
9947 0x0085, /* NEXT LINE */
9948 0x2028, /* LINE SEPARATOR */
9949 0x2029, /* PARAGRAPH SEPARATOR */
9950 };
9951
Fred Drakee4315f52000-05-09 19:53:39 +00009952 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009953 free_list = NULL;
9954 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009955 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009956 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009957 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009958
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009959 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009960 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009961 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009962 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009963
9964 /* initialize the linebreak bloom filter */
9965 bloom_linebreak = make_bloom_mask(
9966 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9967 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009968
9969 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970}
9971
9972/* Finalize the Unicode implementation */
9973
Christian Heimesa156e092008-02-16 07:38:31 +00009974int
9975PyUnicode_ClearFreeList(void)
9976{
9977 int freelist_size = numfree;
9978 PyUnicodeObject *u;
9979
9980 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009981 PyUnicodeObject *v = u;
9982 u = *(PyUnicodeObject **)u;
9983 if (v->str)
9984 PyObject_DEL(v->str);
9985 Py_XDECREF(v->defenc);
9986 PyObject_Del(v);
9987 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +00009988 }
9989 free_list = NULL;
9990 assert(numfree == 0);
9991 return freelist_size;
9992}
9993
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994void
Thomas Wouters78890102000-07-22 19:25:51 +00009995_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009997 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00009999 Py_XDECREF(unicode_empty);
10000 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010001
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010002 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 if (unicode_latin1[i]) {
10004 Py_DECREF(unicode_latin1[i]);
10005 unicode_latin1[i] = NULL;
10006 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010007 }
Christian Heimesa156e092008-02-16 07:38:31 +000010008 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010010
Walter Dörwald16807132007-05-25 13:52:07 +000010011void
10012PyUnicode_InternInPlace(PyObject **p)
10013{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010014 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10015 PyObject *t;
10016 if (s == NULL || !PyUnicode_Check(s))
10017 Py_FatalError(
10018 "PyUnicode_InternInPlace: unicode strings only please!");
10019 /* If it's a subclass, we don't really know what putting
10020 it in the interned dict might do. */
10021 if (!PyUnicode_CheckExact(s))
10022 return;
10023 if (PyUnicode_CHECK_INTERNED(s))
10024 return;
10025 if (interned == NULL) {
10026 interned = PyDict_New();
10027 if (interned == NULL) {
10028 PyErr_Clear(); /* Don't leave an exception */
10029 return;
10030 }
10031 }
10032 /* It might be that the GetItem call fails even
10033 though the key is present in the dictionary,
10034 namely when this happens during a stack overflow. */
10035 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010036 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010037 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010038
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 if (t) {
10040 Py_INCREF(t);
10041 Py_DECREF(*p);
10042 *p = t;
10043 return;
10044 }
Walter Dörwald16807132007-05-25 13:52:07 +000010045
Benjamin Peterson14339b62009-01-31 16:36:08 +000010046 PyThreadState_GET()->recursion_critical = 1;
10047 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10048 PyErr_Clear();
10049 PyThreadState_GET()->recursion_critical = 0;
10050 return;
10051 }
10052 PyThreadState_GET()->recursion_critical = 0;
10053 /* The two references in interned are not counted by refcnt.
10054 The deallocator will take care of this */
10055 Py_REFCNT(s) -= 2;
10056 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010057}
10058
10059void
10060PyUnicode_InternImmortal(PyObject **p)
10061{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010062 PyUnicode_InternInPlace(p);
10063 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10064 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10065 Py_INCREF(*p);
10066 }
Walter Dörwald16807132007-05-25 13:52:07 +000010067}
10068
10069PyObject *
10070PyUnicode_InternFromString(const char *cp)
10071{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010072 PyObject *s = PyUnicode_FromString(cp);
10073 if (s == NULL)
10074 return NULL;
10075 PyUnicode_InternInPlace(&s);
10076 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010077}
10078
10079void _Py_ReleaseInternedUnicodeStrings(void)
10080{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010081 PyObject *keys;
10082 PyUnicodeObject *s;
10083 Py_ssize_t i, n;
10084 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010085
Benjamin Peterson14339b62009-01-31 16:36:08 +000010086 if (interned == NULL || !PyDict_Check(interned))
10087 return;
10088 keys = PyDict_Keys(interned);
10089 if (keys == NULL || !PyList_Check(keys)) {
10090 PyErr_Clear();
10091 return;
10092 }
Walter Dörwald16807132007-05-25 13:52:07 +000010093
Benjamin Peterson14339b62009-01-31 16:36:08 +000010094 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10095 detector, interned unicode strings are not forcibly deallocated;
10096 rather, we give them their stolen references back, and then clear
10097 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010098
Benjamin Peterson14339b62009-01-31 16:36:08 +000010099 n = PyList_GET_SIZE(keys);
10100 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010101 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010102 for (i = 0; i < n; i++) {
10103 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10104 switch (s->state) {
10105 case SSTATE_NOT_INTERNED:
10106 /* XXX Shouldn't happen */
10107 break;
10108 case SSTATE_INTERNED_IMMORTAL:
10109 Py_REFCNT(s) += 1;
10110 immortal_size += s->length;
10111 break;
10112 case SSTATE_INTERNED_MORTAL:
10113 Py_REFCNT(s) += 2;
10114 mortal_size += s->length;
10115 break;
10116 default:
10117 Py_FatalError("Inconsistent interned string state.");
10118 }
10119 s->state = SSTATE_NOT_INTERNED;
10120 }
10121 fprintf(stderr, "total size of all interned strings: "
10122 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10123 "mortal/immortal\n", mortal_size, immortal_size);
10124 Py_DECREF(keys);
10125 PyDict_Clear(interned);
10126 Py_DECREF(interned);
10127 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010128}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010129
10130
10131/********************* Unicode Iterator **************************/
10132
10133typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010134 PyObject_HEAD
10135 Py_ssize_t it_index;
10136 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010137} unicodeiterobject;
10138
10139static void
10140unicodeiter_dealloc(unicodeiterobject *it)
10141{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010142 _PyObject_GC_UNTRACK(it);
10143 Py_XDECREF(it->it_seq);
10144 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010145}
10146
10147static int
10148unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10149{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010150 Py_VISIT(it->it_seq);
10151 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010152}
10153
10154static PyObject *
10155unicodeiter_next(unicodeiterobject *it)
10156{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010157 PyUnicodeObject *seq;
10158 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010159
Benjamin Peterson14339b62009-01-31 16:36:08 +000010160 assert(it != NULL);
10161 seq = it->it_seq;
10162 if (seq == NULL)
10163 return NULL;
10164 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010165
Benjamin Peterson14339b62009-01-31 16:36:08 +000010166 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10167 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010168 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010169 if (item != NULL)
10170 ++it->it_index;
10171 return item;
10172 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010173
Benjamin Peterson14339b62009-01-31 16:36:08 +000010174 Py_DECREF(seq);
10175 it->it_seq = NULL;
10176 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010177}
10178
10179static PyObject *
10180unicodeiter_len(unicodeiterobject *it)
10181{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010182 Py_ssize_t len = 0;
10183 if (it->it_seq)
10184 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10185 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010186}
10187
10188PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10189
10190static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010191 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010192 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010193 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010194};
10195
10196PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010197 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10198 "str_iterator", /* tp_name */
10199 sizeof(unicodeiterobject), /* tp_basicsize */
10200 0, /* tp_itemsize */
10201 /* methods */
10202 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10203 0, /* tp_print */
10204 0, /* tp_getattr */
10205 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010206 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010207 0, /* tp_repr */
10208 0, /* tp_as_number */
10209 0, /* tp_as_sequence */
10210 0, /* tp_as_mapping */
10211 0, /* tp_hash */
10212 0, /* tp_call */
10213 0, /* tp_str */
10214 PyObject_GenericGetAttr, /* tp_getattro */
10215 0, /* tp_setattro */
10216 0, /* tp_as_buffer */
10217 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10218 0, /* tp_doc */
10219 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10220 0, /* tp_clear */
10221 0, /* tp_richcompare */
10222 0, /* tp_weaklistoffset */
10223 PyObject_SelfIter, /* tp_iter */
10224 (iternextfunc)unicodeiter_next, /* tp_iternext */
10225 unicodeiter_methods, /* tp_methods */
10226 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010227};
10228
10229static PyObject *
10230unicode_iter(PyObject *seq)
10231{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010232 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010233
Benjamin Peterson14339b62009-01-31 16:36:08 +000010234 if (!PyUnicode_Check(seq)) {
10235 PyErr_BadInternalCall();
10236 return NULL;
10237 }
10238 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10239 if (it == NULL)
10240 return NULL;
10241 it->it_index = 0;
10242 Py_INCREF(seq);
10243 it->it_seq = (PyUnicodeObject *)seq;
10244 _PyObject_GC_TRACK(it);
10245 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010246}
10247
Martin v. Löwis5b222132007-06-10 09:51:05 +000010248size_t
10249Py_UNICODE_strlen(const Py_UNICODE *u)
10250{
10251 int res = 0;
10252 while(*u++)
10253 res++;
10254 return res;
10255}
10256
10257Py_UNICODE*
10258Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10259{
10260 Py_UNICODE *u = s1;
10261 while ((*u++ = *s2++));
10262 return s1;
10263}
10264
10265Py_UNICODE*
10266Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10267{
10268 Py_UNICODE *u = s1;
10269 while ((*u++ = *s2++))
10270 if (n-- == 0)
10271 break;
10272 return s1;
10273}
10274
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010275Py_UNICODE*
10276Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10277{
10278 Py_UNICODE *u1 = s1;
10279 u1 += Py_UNICODE_strlen(u1);
10280 Py_UNICODE_strcpy(u1, s2);
10281 return s1;
10282}
10283
Martin v. Löwis5b222132007-06-10 09:51:05 +000010284int
10285Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10286{
10287 while (*s1 && *s2 && *s1 == *s2)
10288 s1++, s2++;
10289 if (*s1 && *s2)
10290 return (*s1 < *s2) ? -1 : +1;
10291 if (*s1)
10292 return 1;
10293 if (*s2)
10294 return -1;
10295 return 0;
10296}
10297
Victor Stinneref8d95c2010-08-16 22:03:11 +000010298int
10299Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10300{
10301 register Py_UNICODE u1, u2;
10302 for (; n != 0; n--) {
10303 u1 = *s1;
10304 u2 = *s2;
10305 if (u1 != u2)
10306 return (u1 < u2) ? -1 : +1;
10307 if (u1 == '\0')
10308 return 0;
10309 s1++;
10310 s2++;
10311 }
10312 return 0;
10313}
10314
Martin v. Löwis5b222132007-06-10 09:51:05 +000010315Py_UNICODE*
10316Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10317{
10318 const Py_UNICODE *p;
10319 for (p = s; *p; p++)
10320 if (*p == c)
10321 return (Py_UNICODE*)p;
10322 return NULL;
10323}
10324
Victor Stinner331ea922010-08-10 16:37:20 +000010325Py_UNICODE*
10326Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10327{
10328 const Py_UNICODE *p;
10329 p = s + Py_UNICODE_strlen(s);
10330 while (p != s) {
10331 p--;
10332 if (*p == c)
10333 return (Py_UNICODE*)p;
10334 }
10335 return NULL;
10336}
10337
Victor Stinner71133ff2010-09-01 23:43:53 +000010338Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010339PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010340{
10341 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10342 Py_UNICODE *copy;
10343 Py_ssize_t size;
10344
10345 /* Ensure we won't overflow the size. */
10346 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10347 PyErr_NoMemory();
10348 return NULL;
10349 }
10350 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10351 size *= sizeof(Py_UNICODE);
10352 copy = PyMem_Malloc(size);
10353 if (copy == NULL) {
10354 PyErr_NoMemory();
10355 return NULL;
10356 }
10357 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10358 return copy;
10359}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010360
Georg Brandl66c221e2010-10-14 07:04:07 +000010361/* A _string module, to export formatter_parser and formatter_field_name_split
10362 to the string.Formatter class implemented in Python. */
10363
10364static PyMethodDef _string_methods[] = {
10365 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10366 METH_O, PyDoc_STR("split the argument as a field name")},
10367 {"formatter_parser", (PyCFunction) formatter_parser,
10368 METH_O, PyDoc_STR("parse the argument as a format string")},
10369 {NULL, NULL}
10370};
10371
10372static struct PyModuleDef _string_module = {
10373 PyModuleDef_HEAD_INIT,
10374 "_string",
10375 PyDoc_STR("string helper module"),
10376 0,
10377 _string_methods,
10378 NULL,
10379 NULL,
10380 NULL,
10381 NULL
10382};
10383
10384PyMODINIT_FUNC
10385PyInit__string(void)
10386{
10387 return PyModule_Create(&_string_module);
10388}
10389
10390
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010391#ifdef __cplusplus
10392}
10393#endif