blob: 1f1fe8e6fa70715b15f4af9fdcb524c3cce6a4b7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
Christian Heimes190d79e2008-01-30 11:58:22 +0000114/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by ASCII code (0..127): non-zero for the most
   frequent whitespace characters.  One row per 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
144
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000145static PyObject *unicode_encode_call_errorhandler(const char *errors,
146 PyObject **errorHandler,const char *encoding, const char *reason,
147 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
148 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
149
Victor Stinner31be90b2010-04-22 19:38:16 +0000150static void raise_encode_exception(PyObject **exceptionObject,
151 const char *encoding,
152 const Py_UNICODE *unicode, Py_ssize_t size,
153 Py_ssize_t startpos, Py_ssize_t endpos,
154 const char *reason);
155
Christian Heimes190d79e2008-01-30 11:58:22 +0000156/* Same for linebreaks */
/* Lookup table indexed by ASCII code (0..127): non-zero for the ASCII
   characters treated as line breaks.  One row per 16 code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
183
184
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000185Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000186PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000188#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000189 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000190#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 /* This is actually an illegal character, so it should
192 not be passed to unichr. */
193 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000194#endif
195}
196
Thomas Wouters477c8d52006-05-27 19:21:47 +0000197/* --- Bloom Filters ----------------------------------------------------- */
198
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
202
203/* the linebreak mask is set up by Unicode_Init below */
204
Antoine Pitrouf068f942010-01-13 14:19:12 +0000205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
Thomas Wouters477c8d52006-05-27 19:21:47 +0000215#define BLOOM_MASK unsigned long
216
217static BLOOM_MASK bloom_linebreak;
218
/* BLOOM_ADD(mask, ch): set the bit that character ch maps to in mask
   (mask must be a modifiable lvalue).
   BLOOM(mask, ch): non-zero if the bit that ch maps to is set in mask.

   The bit index is ch modulo BLOOM_WIDTH.  Both arguments are fully
   parenthesized so that compound expressions such as `a | b` can be passed
   as mask without operator-precedence surprises. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000221
Benjamin Peterson29060642009-01-31 22:14:21 +0000222#define BLOOM_LINEBREAK(ch) \
223 ((ch) < 128U ? ascii_linebreak[(ch)] : \
224 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228 /* calculate simple bloom-style bitmask for a given unicode string */
229
Antoine Pitrouf068f942010-01-13 14:19:12 +0000230 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000231 Py_ssize_t i;
232
233 mask = 0;
234 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000235 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000236
237 return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242 Py_ssize_t i;
243
244 for (i = 0; i < setlen; i++)
245 if (set[i] == chr)
246 return 1;
247
248 return 0;
249}
250
/* Non-zero if chr is a member of set[0..setlen-1].  The cheap bloom test
   on mask runs first; the exact scan with unicode_member() only runs when
   the bloom test reports a possible hit.

   The expansion is wrapped in parentheses so the macro behaves as a single
   operand (e.g. `!BLOOM_MEMBER(...)` negates the whole test).  Note that
   chr is evaluated more than once, so it must be free of side effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
253
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254/* --- Unicode Object ----------------------------------------------------- */
255
256static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000257int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +0000258 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259{
260 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000261
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000264 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000265
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000266 /* Resizing shared object (unicode_empty or single character
267 objects) in-place is not allowed. Use PyUnicode_Resize()
268 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000269
Benjamin Peterson14339b62009-01-31 16:36:08 +0000270 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000271 (unicode->length == 1 &&
272 unicode->str[0] < 256U &&
273 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000274 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000275 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000276 return -1;
277 }
278
Thomas Wouters477c8d52006-05-27 19:21:47 +0000279 /* We allocate one more byte to make sure the string is Ux0000 terminated.
280 The overallocation is also used by fastsearch, which assumes that it's
281 safe to look at str[length] (without making any assumptions about what
282 it contains). */
283
Guido van Rossumd57fd912000-03-10 22:53:23 +0000284 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000285 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000286 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000287 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000288 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000289 PyErr_NoMemory();
290 return -1;
291 }
292 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000294
Benjamin Peterson29060642009-01-31 22:14:21 +0000295 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000296 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000297 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000298 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000299 }
300 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000301
Guido van Rossumd57fd912000-03-10 22:53:23 +0000302 return 0;
303}
304
/* We allocate one extra Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000308
309 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000310 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000311
312*/
313
314static
Martin v. Löwis18e16552006-02-15 17:27:45 +0000315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000316{
317 register PyUnicodeObject *unicode;
318
Thomas Wouters477c8d52006-05-27 19:21:47 +0000319 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320 if (length == 0 && unicode_empty != NULL) {
321 Py_INCREF(unicode_empty);
322 return unicode_empty;
323 }
324
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000325 /* Ensure we won't overflow the size. */
326 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327 return (PyUnicodeObject *)PyErr_NoMemory();
328 }
329
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000331 if (free_list) {
332 unicode = free_list;
333 free_list = *(PyUnicodeObject **)unicode;
334 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000335 if (unicode->str) {
336 /* Keep-Alive optimization: we only upsize the buffer,
337 never downsize it. */
338 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000339 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000340 PyObject_DEL(unicode->str);
341 unicode->str = NULL;
342 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000343 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000344 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000345 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000347 }
348 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000349 }
350 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000351 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000352 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 if (unicode == NULL)
354 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 }
358
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000359 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000360 PyErr_NoMemory();
361 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000362 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000363 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000364 * the caller fails before initializing str -- unicode_resize()
365 * reads str[0], and the Keep-Alive optimization can keep memory
366 * allocated for str alive across a call to unicode_dealloc(unicode).
367 * We don't want unicode_resize to read uninitialized memory in
368 * that case.
369 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000370 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000374 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000375 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000376 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000377
Benjamin Peterson29060642009-01-31 22:14:21 +0000378 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000379 /* XXX UNREF/NEWREF interface should be more symmetrical */
380 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000382 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384}
385
386static
Guido van Rossum9475a232001-10-05 20:51:39 +0000387void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388{
Walter Dörwald16807132007-05-25 13:52:07 +0000389 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000390 case SSTATE_NOT_INTERNED:
391 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000392
Benjamin Peterson29060642009-01-31 22:14:21 +0000393 case SSTATE_INTERNED_MORTAL:
394 /* revive dead object temporarily for DelItem */
395 Py_REFCNT(unicode) = 3;
396 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397 Py_FatalError(
398 "deletion of interned string failed");
399 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000400
Benjamin Peterson29060642009-01-31 22:14:21 +0000401 case SSTATE_INTERNED_IMMORTAL:
402 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000403
Benjamin Peterson29060642009-01-31 22:14:21 +0000404 default:
405 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000406 }
407
Guido van Rossum604ddf82001-12-06 20:03:56 +0000408 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000409 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000410 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000411 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412 PyObject_DEL(unicode->str);
413 unicode->str = NULL;
414 unicode->length = 0;
415 }
416 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000417 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000418 }
419 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000420 *(PyUnicodeObject **)unicode = free_list;
421 free_list = unicode;
422 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000425 PyObject_DEL(unicode->str);
426 Py_XDECREF(unicode->defenc);
427 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428 }
429}
430
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000433{
434 register PyUnicodeObject *v;
435
436 /* Argument checks */
437 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000438 PyErr_BadInternalCall();
439 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000440 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000441 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000442 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000443 PyErr_BadInternalCall();
444 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000445 }
446
447 /* Resizing unicode_empty and single character objects is not
448 possible since these are being shared. We simply return a fresh
449 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000450 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000451 (v == unicode_empty || v->length == 1)) {
452 PyUnicodeObject *w = _PyUnicode_New(length);
453 if (w == NULL)
454 return -1;
455 Py_UNICODE_COPY(w->str, v->str,
456 length < v->length ? length : v->length);
457 Py_DECREF(*unicode);
458 *unicode = w;
459 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000460 }
461
462 /* Note that we don't have to modify *unicode for unshared Unicode
463 objects, since we can modify them in-place. */
464 return unicode_resize(v, length);
465}
466
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000471
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson29060642009-01-31 22:14:21 +0000473 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474{
475 PyUnicodeObject *unicode;
476
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000477 /* If the Unicode data is known at construction time, we can apply
478 some optimizations which share commonly used objects. */
479 if (u != NULL) {
480
Benjamin Peterson29060642009-01-31 22:14:21 +0000481 /* Optimization for empty strings */
482 if (size == 0 && unicode_empty != NULL) {
483 Py_INCREF(unicode_empty);
484 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000485 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000486
487 /* Single character Unicode objects in the Latin-1 range are
488 shared when using this constructor */
489 if (size == 1 && *u < 256) {
490 unicode = unicode_latin1[*u];
491 if (!unicode) {
492 unicode = _PyUnicode_New(1);
493 if (!unicode)
494 return NULL;
495 unicode->str[0] = *u;
496 unicode_latin1[*u] = unicode;
497 }
498 Py_INCREF(unicode);
499 return (PyObject *)unicode;
500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000501 }
Tim Petersced69f82003-09-16 20:30:58 +0000502
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503 unicode = _PyUnicode_New(size);
504 if (!unicode)
505 return NULL;
506
507 /* Copy the Unicode data into the new object */
508 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000509 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 return (PyObject *)unicode;
512}
513
Walter Dörwaldd2034312007-05-18 16:29:38 +0000514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000515{
516 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000517
Benjamin Peterson14339b62009-01-31 16:36:08 +0000518 if (size < 0) {
519 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000520 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000521 return NULL;
522 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000524 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000525 some optimizations which share commonly used objects.
526 Also, this means the input must be UTF-8, so fall back to the
527 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000528 if (u != NULL) {
529
Benjamin Peterson29060642009-01-31 22:14:21 +0000530 /* Optimization for empty strings */
531 if (size == 0 && unicode_empty != NULL) {
532 Py_INCREF(unicode_empty);
533 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000534 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000535
536 /* Single characters are shared when using this constructor.
537 Restrict to ASCII, since the input must be UTF-8. */
538 if (size == 1 && Py_CHARMASK(*u) < 128) {
539 unicode = unicode_latin1[Py_CHARMASK(*u)];
540 if (!unicode) {
541 unicode = _PyUnicode_New(1);
542 if (!unicode)
543 return NULL;
544 unicode->str[0] = Py_CHARMASK(*u);
545 unicode_latin1[Py_CHARMASK(*u)] = unicode;
546 }
547 Py_INCREF(unicode);
548 return (PyObject *)unicode;
549 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000550
551 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000552 }
553
Walter Dörwald55507312007-05-18 13:12:10 +0000554 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000555 if (!unicode)
556 return NULL;
557
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 return (PyObject *)unicode;
559}
560
Walter Dörwaldd2034312007-05-18 16:29:38 +0000561PyObject *PyUnicode_FromString(const char *u)
562{
563 size_t size = strlen(u);
564 if (size > PY_SSIZE_T_MAX) {
565 PyErr_SetString(PyExc_OverflowError, "input too long");
566 return NULL;
567 }
568
569 return PyUnicode_FromStringAndSize(u, size);
570}
571
Guido van Rossumd57fd912000-03-10 22:53:23 +0000572#ifdef HAVE_WCHAR_H
573
Mark Dickinson081dfee2009-03-18 14:47:41 +0000574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581 to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584 Py_ssize_t size)
585{
586 PyUnicodeObject *unicode;
587 register Py_ssize_t i;
588 Py_ssize_t alloc;
589 const wchar_t *orig_w;
590
591 if (w == NULL) {
592 if (size == 0)
593 return PyUnicode_FromStringAndSize(NULL, 0);
594 PyErr_BadInternalCall();
595 return NULL;
596 }
597
598 if (size == -1) {
599 size = wcslen(w);
600 }
601
602 alloc = size;
603 orig_w = w;
604 for (i = size; i > 0; i--) {
605 if (*w > 0xFFFF)
606 alloc++;
607 w++;
608 }
609 w = orig_w;
610 unicode = _PyUnicode_New(alloc);
611 if (!unicode)
612 return NULL;
613
614 /* Copy the wchar_t data into the new object */
615 {
616 register Py_UNICODE *u;
617 u = PyUnicode_AS_UNICODE(unicode);
618 for (i = size; i > 0; i--) {
619 if (*w > 0xFFFF) {
620 wchar_t ordinal = *w++;
621 ordinal -= 0x10000;
622 *u++ = 0xD800 | (ordinal >> 10);
623 *u++ = 0xDC00 | (ordinal & 0x3FF);
624 }
625 else
626 *u++ = *w++;
627 }
628 }
629 return (PyObject *)unicode;
630}
631
632#else
633
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson29060642009-01-31 22:14:21 +0000635 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636{
637 PyUnicodeObject *unicode;
638
639 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000640 if (size == 0)
641 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000642 PyErr_BadInternalCall();
643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644 }
645
Martin v. Löwis790465f2008-04-05 20:41:37 +0000646 if (size == -1) {
647 size = wcslen(w);
648 }
649
Guido van Rossumd57fd912000-03-10 22:53:23 +0000650 unicode = _PyUnicode_New(size);
651 if (!unicode)
652 return NULL;
653
654 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000657#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000659 register Py_UNICODE *u;
660 register Py_ssize_t i;
661 u = PyUnicode_AS_UNICODE(unicode);
662 for (i = size; i > 0; i--)
663 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 }
665#endif
666
667 return (PyObject *)unicode;
668}
669
Mark Dickinson081dfee2009-03-18 14:47:41 +0000670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
Walter Dörwald346737f2007-05-31 10:44:43 +0000674static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000677{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000678 *fmt++ = '%';
679 if (width) {
680 if (zeropad)
681 *fmt++ = '0';
682 fmt += sprintf(fmt, "%d", width);
683 }
684 if (precision)
685 fmt += sprintf(fmt, ".%d", precision);
686 if (longflag)
687 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000688 else if (longlongflag) {
689 /* longlongflag should only ever be nonzero on machines with
690 HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692 char *f = PY_FORMAT_LONG_LONG;
693 while (*f)
694 *fmt++ = *f++;
695#else
696 /* we shouldn't ever get here */
697 assert(0);
698 *fmt++ = 'l';
699#endif
700 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000701 else if (size_tflag) {
702 char *f = PY_FORMAT_SIZE_T;
703 while (*f)
704 *fmt++ = *f++;
705 }
706 *fmt++ = c;
707 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000708}
709
Walter Dörwaldd2034312007-05-18 16:29:38 +0000710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
711
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000712/* size of fixed-size buffer for formatting single arguments */
713#define ITEM_BUFFER_LEN 21
714/* maximum number of characters required for output of %ld. 21 characters
715 allows for 64-bit integers (in decimal) and an optional sign. */
716#define MAX_LONG_CHARS 21
717/* maximum number of characters required for output of %lld.
718 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
719 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
Walter Dörwaldd2034312007-05-18 16:29:38 +0000722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000725 va_list count;
726 Py_ssize_t callcount = 0;
727 PyObject **callresults = NULL;
728 PyObject **callresult = NULL;
729 Py_ssize_t n = 0;
730 int width = 0;
731 int precision = 0;
732 int zeropad;
733 const char* f;
734 Py_UNICODE *s;
735 PyObject *string;
736 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000737 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000738 /* use abuffer instead of buffer, if we need more space
739 * (which can happen if there's a format specifier with width). */
740 char *abuffer = NULL;
741 char *realbuffer;
742 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000743 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000744 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000745
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000746 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000747 /* step 1: count the number of %S/%R/%A/%s format specifications
748 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000751 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000752 if (*f == '%') {
753 if (*(f+1)=='%')
754 continue;
Victor Stinner2b574a22011-03-01 22:48:49 +0000755 if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000756 ++callcount;
David Malcolm96960882010-11-05 17:23:41 +0000757 while (Py_ISDIGIT((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000758 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000759 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000760 ;
761 if (*f == 's')
762 ++callcount;
763 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000764 else if (128 <= (unsigned char)*f) {
765 PyErr_Format(PyExc_ValueError,
766 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000767 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000768 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000769 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000770 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000771 }
772 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000773 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000774 if (callcount) {
775 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776 if (!callresults) {
777 PyErr_NoMemory();
778 return NULL;
779 }
780 callresult = callresults;
781 }
782 /* step 3: figure out how large a buffer we need */
783 for (f = format; *f; f++) {
784 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000785#ifdef HAVE_LONG_LONG
786 int longlongflag = 0;
787#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +0000788 const char* p = f;
789 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000790 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000791 width = (width*10) + *f++ - '0';
David Malcolm96960882010-11-05 17:23:41 +0000792 while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000793 ;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794
Benjamin Peterson14339b62009-01-31 16:36:08 +0000795 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796 * they don't affect the amount of space we reserve.
797 */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000798 if (*f == 'l') {
799 if (f[1] == 'd' || f[1] == 'u') {
800 ++f;
801 }
802#ifdef HAVE_LONG_LONG
803 else if (f[1] == 'l' &&
804 (f[2] == 'd' || f[2] == 'u')) {
805 longlongflag = 1;
806 f += 2;
807 }
808#endif
809 }
810 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000811 ++f;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000812 }
Walter Dörwaldd2034312007-05-18 16:29:38 +0000813
Benjamin Peterson14339b62009-01-31 16:36:08 +0000814 switch (*f) {
815 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +0000816 {
817#ifndef Py_UNICODE_WIDE
818 int ordinal = va_arg(count, int);
819 if (ordinal > 0xffff)
820 n += 2;
821 else
822 n++;
823#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000824 (void)va_arg(count, int);
Victor Stinner659eb842011-02-23 12:14:22 +0000825 n++;
826#endif
827 break;
828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000829 case '%':
830 n++;
831 break;
832 case 'd': case 'u': case 'i': case 'x':
833 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000834#ifdef HAVE_LONG_LONG
835 if (longlongflag) {
836 if (width < MAX_LONG_LONG_CHARS)
837 width = MAX_LONG_LONG_CHARS;
838 }
839 else
840#endif
841 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842 including sign. Decimal takes the most space. This
843 isn't enough for octal. If a width is specified we
844 need more (which we allocate later). */
845 if (width < MAX_LONG_CHARS)
846 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000847 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000848 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000849 if (abuffersize < width)
850 abuffersize = width;
851 break;
852 case 's':
853 {
854 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000855 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000856 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857 if (!str)
858 goto fail;
859 n += PyUnicode_GET_SIZE(str);
860 /* Remember the str and switch to the next slot */
861 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000862 break;
863 }
864 case 'U':
865 {
866 PyObject *obj = va_arg(count, PyObject *);
867 assert(obj && PyUnicode_Check(obj));
868 n += PyUnicode_GET_SIZE(obj);
869 break;
870 }
871 case 'V':
872 {
873 PyObject *obj = va_arg(count, PyObject *);
874 const char *str = va_arg(count, const char *);
Victor Stinner2b574a22011-03-01 22:48:49 +0000875 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000876 assert(obj || str);
877 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2b574a22011-03-01 22:48:49 +0000878 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000879 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2b574a22011-03-01 22:48:49 +0000880 *callresult++ = NULL;
881 }
882 else {
883 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
884 if (!str_obj)
885 goto fail;
886 n += PyUnicode_GET_SIZE(str_obj);
887 *callresult++ = str_obj;
888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000889 break;
890 }
891 case 'S':
892 {
893 PyObject *obj = va_arg(count, PyObject *);
894 PyObject *str;
895 assert(obj);
896 str = PyObject_Str(obj);
897 if (!str)
898 goto fail;
899 n += PyUnicode_GET_SIZE(str);
900 /* Remember the str and switch to the next slot */
901 *callresult++ = str;
902 break;
903 }
904 case 'R':
905 {
906 PyObject *obj = va_arg(count, PyObject *);
907 PyObject *repr;
908 assert(obj);
909 repr = PyObject_Repr(obj);
910 if (!repr)
911 goto fail;
912 n += PyUnicode_GET_SIZE(repr);
913 /* Remember the repr and switch to the next slot */
914 *callresult++ = repr;
915 break;
916 }
917 case 'A':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 PyObject *ascii;
921 assert(obj);
922 ascii = PyObject_ASCII(obj);
923 if (!ascii)
924 goto fail;
925 n += PyUnicode_GET_SIZE(ascii);
926 /* Remember the repr and switch to the next slot */
927 *callresult++ = ascii;
928 break;
929 }
930 case 'p':
931 (void) va_arg(count, int);
932 /* maximum 64-bit pointer representation:
933 * 0xffffffffffffffff
934 * so 19 characters is enough.
935 * XXX I count 18 -- what's the extra for?
936 */
937 n += 19;
938 break;
939 default:
940 /* if we stumble upon an unknown
941 formatting code, copy the rest of
942 the format string to the output
943 string. (we cannot just skip the
944 code, since there's no way to know
945 what's in the argument list) */
946 n += strlen(p);
947 goto expand;
948 }
949 } else
950 n++;
951 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000952 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000953 if (abuffersize > ITEM_BUFFER_LEN) {
954 /* add 1 for sprintf's trailing null byte */
955 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +0000956 if (!abuffer) {
957 PyErr_NoMemory();
958 goto fail;
959 }
960 realbuffer = abuffer;
961 }
962 else
963 realbuffer = buffer;
964 /* step 4: fill the buffer */
965 /* Since we've analyzed how much space we need for the worst case,
966 we don't have to resize the string.
967 There can be no errors beyond this point. */
968 string = PyUnicode_FromUnicode(NULL, n);
969 if (!string)
970 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000971
Benjamin Peterson14339b62009-01-31 16:36:08 +0000972 s = PyUnicode_AS_UNICODE(string);
973 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000974
Benjamin Peterson14339b62009-01-31 16:36:08 +0000975 for (f = format; *f; f++) {
976 if (*f == '%') {
977 const char* p = f++;
978 int longflag = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000979 int longlongflag = 0;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000980 int size_tflag = 0;
981 zeropad = (*f == '0');
982 /* parse the width.precision part */
983 width = 0;
David Malcolm96960882010-11-05 17:23:41 +0000984 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000985 width = (width*10) + *f++ - '0';
986 precision = 0;
987 if (*f == '.') {
988 f++;
David Malcolm96960882010-11-05 17:23:41 +0000989 while (Py_ISDIGIT((unsigned)*f))
Benjamin Peterson14339b62009-01-31 16:36:08 +0000990 precision = (precision*10) + *f++ - '0';
991 }
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000992 /* Handle %ld, %lu, %lld and %llu. */
993 if (*f == 'l') {
994 if (f[1] == 'd' || f[1] == 'u') {
995 longflag = 1;
996 ++f;
997 }
998#ifdef HAVE_LONG_LONG
999 else if (f[1] == 'l' &&
1000 (f[2] == 'd' || f[2] == 'u')) {
1001 longlongflag = 1;
1002 f += 2;
1003 }
1004#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001005 }
1006 /* handle the size_t flag. */
1007 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1008 size_tflag = 1;
1009 ++f;
1010 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001011
Benjamin Peterson14339b62009-01-31 16:36:08 +00001012 switch (*f) {
1013 case 'c':
Victor Stinner659eb842011-02-23 12:14:22 +00001014 {
1015 int ordinal = va_arg(vargs, int);
1016#ifndef Py_UNICODE_WIDE
1017 if (ordinal > 0xffff) {
1018 ordinal -= 0x10000;
1019 *s++ = 0xD800 | (ordinal >> 10);
1020 *s++ = 0xDC00 | (ordinal & 0x3FF);
1021 } else
1022#endif
1023 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001024 break;
Victor Stinner659eb842011-02-23 12:14:22 +00001025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001026 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001027 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1028 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001029 if (longflag)
1030 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001031#ifdef HAVE_LONG_LONG
1032 else if (longlongflag)
1033 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1034#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001035 else if (size_tflag)
1036 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1037 else
1038 sprintf(realbuffer, fmt, va_arg(vargs, int));
1039 appendstring(realbuffer);
1040 break;
1041 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001042 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1043 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001044 if (longflag)
1045 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001046#ifdef HAVE_LONG_LONG
1047 else if (longlongflag)
1048 sprintf(realbuffer, fmt, va_arg(vargs,
1049 unsigned PY_LONG_LONG));
1050#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 else if (size_tflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1053 else
1054 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1055 appendstring(realbuffer);
1056 break;
1057 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001058 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001059 sprintf(realbuffer, fmt, va_arg(vargs, int));
1060 appendstring(realbuffer);
1061 break;
1062 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001063 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001064 sprintf(realbuffer, fmt, va_arg(vargs, int));
1065 appendstring(realbuffer);
1066 break;
1067 case 's':
1068 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001069 /* unused, since we already have the result */
1070 (void) va_arg(vargs, char *);
1071 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1072 PyUnicode_GET_SIZE(*callresult));
1073 s += PyUnicode_GET_SIZE(*callresult);
1074 /* We're done with the unicode()/repr() => forget it */
1075 Py_DECREF(*callresult);
1076 /* switch to next unicode()/repr() result */
1077 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001078 break;
1079 }
1080 case 'U':
1081 {
1082 PyObject *obj = va_arg(vargs, PyObject *);
1083 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1084 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1085 s += size;
1086 break;
1087 }
1088 case 'V':
1089 {
1090 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2b574a22011-03-01 22:48:49 +00001091 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001092 if (obj) {
1093 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1094 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1095 s += size;
1096 } else {
Victor Stinner2b574a22011-03-01 22:48:49 +00001097 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1098 PyUnicode_GET_SIZE(*callresult));
1099 s += PyUnicode_GET_SIZE(*callresult);
1100 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001101 }
Victor Stinner2b574a22011-03-01 22:48:49 +00001102 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001103 break;
1104 }
1105 case 'S':
1106 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001107 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001108 {
1109 Py_UNICODE *ucopy;
1110 Py_ssize_t usize;
1111 Py_ssize_t upos;
1112 /* unused, since we already have the result */
1113 (void) va_arg(vargs, PyObject *);
1114 ucopy = PyUnicode_AS_UNICODE(*callresult);
1115 usize = PyUnicode_GET_SIZE(*callresult);
1116 for (upos = 0; upos<usize;)
1117 *s++ = ucopy[upos++];
1118 /* We're done with the unicode()/repr() => forget it */
1119 Py_DECREF(*callresult);
1120 /* switch to next unicode()/repr() result */
1121 ++callresult;
1122 break;
1123 }
1124 case 'p':
1125 sprintf(buffer, "%p", va_arg(vargs, void*));
1126 /* %p is ill-defined: ensure leading 0x. */
1127 if (buffer[1] == 'X')
1128 buffer[1] = 'x';
1129 else if (buffer[1] != 'x') {
1130 memmove(buffer+2, buffer, strlen(buffer)+1);
1131 buffer[0] = '0';
1132 buffer[1] = 'x';
1133 }
1134 appendstring(buffer);
1135 break;
1136 case '%':
1137 *s++ = '%';
1138 break;
1139 default:
1140 appendstring(p);
1141 goto end;
1142 }
Victor Stinner1205f272010-09-11 00:54:47 +00001143 }
Victor Stinner1205f272010-09-11 00:54:47 +00001144 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001145 *s++ = *f;
1146 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001147
Benjamin Peterson29060642009-01-31 22:14:21 +00001148 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001149 if (callresults)
1150 PyObject_Free(callresults);
1151 if (abuffer)
1152 PyObject_Free(abuffer);
1153 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1154 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001155 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001156 if (callresults) {
1157 PyObject **callresult2 = callresults;
1158 while (callresult2 < callresult) {
Victor Stinner2b574a22011-03-01 22:48:49 +00001159 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001160 ++callresult2;
1161 }
1162 PyObject_Free(callresults);
1163 }
1164 if (abuffer)
1165 PyObject_Free(abuffer);
1166 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001167}
1168
1169#undef appendstring
1170
1171PyObject *
1172PyUnicode_FromFormat(const char *format, ...)
1173{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001174 PyObject* ret;
1175 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001176
1177#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001178 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001179#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001180 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001181#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001182 ret = PyUnicode_FromFormatV(format, vargs);
1183 va_end(vargs);
1184 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001185}
1186
Victor Stinner5593d8a2010-10-02 11:11:27 +00001187/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1188 convert a Unicode object to a wide character string.
1189
1190 - If w is NULL: return the number of wide characters (including the nul
1191 character) required to convert the unicode object. Ignore size argument.
1192
1193 - Otherwise: return the number of wide characters (excluding the nul
1194 character) written into w. Write at most size wide characters (including
1195 the nul character). */
1196static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001197unicode_aswidechar(PyUnicodeObject *unicode,
1198 wchar_t *w,
1199 Py_ssize_t size)
1200{
1201#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001202 Py_ssize_t res;
1203 if (w != NULL) {
1204 res = PyUnicode_GET_SIZE(unicode);
1205 if (size > res)
1206 size = res + 1;
1207 else
1208 res = size;
1209 memcpy(w, unicode->str, size * sizeof(wchar_t));
1210 return res;
1211 }
1212 else
1213 return PyUnicode_GET_SIZE(unicode) + 1;
1214#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1215 register const Py_UNICODE *u;
1216 const Py_UNICODE *uend;
1217 const wchar_t *worig, *wend;
1218 Py_ssize_t nchar;
1219
Victor Stinner137c34c2010-09-29 10:25:54 +00001220 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001221 uend = u + PyUnicode_GET_SIZE(unicode);
1222 if (w != NULL) {
1223 worig = w;
1224 wend = w + size;
1225 while (u != uend && w != wend) {
1226 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1227 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1228 {
1229 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1230 u += 2;
1231 }
1232 else {
1233 *w = *u;
1234 u++;
1235 }
1236 w++;
1237 }
1238 if (w != wend)
1239 *w = L'\0';
1240 return w - worig;
1241 }
1242 else {
1243 nchar = 1; /* nul character at the end */
1244 while (u != uend) {
1245 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1246 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1247 u += 2;
1248 else
1249 u++;
1250 nchar++;
1251 }
1252 }
1253 return nchar;
1254#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1255 register Py_UNICODE *u, *uend, ordinal;
1256 register Py_ssize_t i;
1257 wchar_t *worig, *wend;
1258 Py_ssize_t nchar;
1259
1260 u = PyUnicode_AS_UNICODE(unicode);
1261 uend = u + PyUnicode_GET_SIZE(u);
1262 if (w != NULL) {
1263 worig = w;
1264 wend = w + size;
1265 while (u != uend && w != wend) {
1266 ordinal = *u;
1267 if (ordinal > 0xffff) {
1268 ordinal -= 0x10000;
1269 *w++ = 0xD800 | (ordinal >> 10);
1270 *w++ = 0xDC00 | (ordinal & 0x3FF);
1271 }
1272 else
1273 *w++ = ordinal;
1274 u++;
1275 }
1276 if (w != wend)
1277 *w = 0;
1278 return w - worig;
1279 }
1280 else {
1281 nchar = 1; /* nul character */
1282 while (u != uend) {
1283 if (*u > 0xffff)
1284 nchar += 2;
1285 else
1286 nchar++;
1287 u++;
1288 }
1289 return nchar;
1290 }
1291#else
1292# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001293#endif
1294}
1295
1296Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001297PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001298 wchar_t *w,
1299 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300{
1301 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001302 PyErr_BadInternalCall();
1303 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001305 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306}
1307
Victor Stinner137c34c2010-09-29 10:25:54 +00001308wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001309PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001310 Py_ssize_t *size)
1311{
1312 wchar_t* buffer;
1313 Py_ssize_t buflen;
1314
1315 if (unicode == NULL) {
1316 PyErr_BadInternalCall();
1317 return NULL;
1318 }
1319
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001320 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001321 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001322 PyErr_NoMemory();
1323 return NULL;
1324 }
1325
Victor Stinner137c34c2010-09-29 10:25:54 +00001326 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1327 if (buffer == NULL) {
1328 PyErr_NoMemory();
1329 return NULL;
1330 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001331 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001332 if (size != NULL)
1333 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001334 return buffer;
1335}
1336
Guido van Rossumd57fd912000-03-10 22:53:23 +00001337#endif
1338
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001339PyObject *PyUnicode_FromOrdinal(int ordinal)
1340{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001341 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001342
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001343 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001344 PyErr_SetString(PyExc_ValueError,
1345 "chr() arg not in range(0x110000)");
1346 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001347 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001348
1349#ifndef Py_UNICODE_WIDE
1350 if (ordinal > 0xffff) {
1351 ordinal -= 0x10000;
1352 s[0] = 0xD800 | (ordinal >> 10);
1353 s[1] = 0xDC00 | (ordinal & 0x3FF);
1354 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001355 }
1356#endif
1357
Hye-Shik Chang40574832004-04-06 07:24:51 +00001358 s[0] = (Py_UNICODE)ordinal;
1359 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360}
1361
Guido van Rossumd57fd912000-03-10 22:53:23 +00001362PyObject *PyUnicode_FromObject(register PyObject *obj)
1363{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001364 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001365 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001366 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001367 Py_INCREF(obj);
1368 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001369 }
1370 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001371 /* For a Unicode subtype that's not a Unicode object,
1372 return a true Unicode object with the same data. */
1373 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1374 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001375 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001376 PyErr_Format(PyExc_TypeError,
1377 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001378 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001379 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001380}
1381
1382PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00001383 const char *encoding,
1384 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001385{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001386 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001387 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001388
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 PyErr_BadInternalCall();
1391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001393
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001394 /* Decoding bytes objects is the most common case and should be fast */
1395 if (PyBytes_Check(obj)) {
1396 if (PyBytes_GET_SIZE(obj) == 0) {
1397 Py_INCREF(unicode_empty);
1398 v = (PyObject *) unicode_empty;
1399 }
1400 else {
1401 v = PyUnicode_Decode(
1402 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1403 encoding, errors);
1404 }
1405 return v;
1406 }
1407
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001408 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001409 PyErr_SetString(PyExc_TypeError,
1410 "decoding str is not supported");
1411 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001412 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1415 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1416 PyErr_Format(PyExc_TypeError,
1417 "coercing to str: need bytes, bytearray "
1418 "or buffer-like object, %.80s found",
1419 Py_TYPE(obj)->tp_name);
1420 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001421 }
Tim Petersced69f82003-09-16 20:30:58 +00001422
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001423 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001424 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001425 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 }
Tim Petersced69f82003-09-16 20:30:58 +00001427 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001428 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001429
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001430 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001431 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432}
1433
/* Copy encoding into lower, converting upper case characters to lower case
   and replacing '_' with '-', in order to catch e.g. UTF_8.

   lower is a buffer of lower_len bytes.  Return 0 on error (encoding is
   longer than lower_len-1), 1 on success; lower is nul-terminated only on
   success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last byte of the buffer, reserved for the trailing nul */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1466
1467PyObject *PyUnicode_Decode(const char *s,
1468 Py_ssize_t size,
1469 const char *encoding,
1470 const char *errors)
1471{
1472 PyObject *buffer = NULL, *unicode;
1473 Py_buffer info;
1474 char lower[11]; /* Enough for any encoding shortcut */
1475
1476 if (encoding == NULL)
1477 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001478
1479 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001480 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1481 if (strcmp(lower, "utf-8") == 0)
1482 return PyUnicode_DecodeUTF8(s, size, errors);
1483 else if ((strcmp(lower, "latin-1") == 0) ||
1484 (strcmp(lower, "iso-8859-1") == 0))
1485 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001486#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001487 else if (strcmp(lower, "mbcs") == 0)
1488 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001489#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001490 else if (strcmp(lower, "ascii") == 0)
1491 return PyUnicode_DecodeASCII(s, size, errors);
1492 else if (strcmp(lower, "utf-16") == 0)
1493 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1494 else if (strcmp(lower, "utf-32") == 0)
1495 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497
1498 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001499 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001500 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001501 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001502 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503 if (buffer == NULL)
1504 goto onError;
1505 unicode = PyCodec_Decode(buffer, encoding, errors);
1506 if (unicode == NULL)
1507 goto onError;
1508 if (!PyUnicode_Check(unicode)) {
1509 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001510 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001511 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 Py_DECREF(unicode);
1513 goto onError;
1514 }
1515 Py_DECREF(buffer);
1516 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001517
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 Py_XDECREF(buffer);
1520 return NULL;
1521}
1522
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001523PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1524 const char *encoding,
1525 const char *errors)
1526{
1527 PyObject *v;
1528
1529 if (!PyUnicode_Check(unicode)) {
1530 PyErr_BadArgument();
1531 goto onError;
1532 }
1533
1534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001536
1537 /* Decode via the codec registry */
1538 v = PyCodec_Decode(unicode, encoding, errors);
1539 if (v == NULL)
1540 goto onError;
1541 return v;
1542
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001544 return NULL;
1545}
1546
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001547PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
1550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 if (!PyUnicode_Check(v)) {
1566 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001567 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001568 Py_TYPE(v)->tp_name);
1569 Py_DECREF(v);
1570 goto onError;
1571 }
1572 return v;
1573
Benjamin Peterson29060642009-01-31 22:14:21 +00001574 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575 return NULL;
1576}
1577
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00001579 Py_ssize_t size,
1580 const char *encoding,
1581 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582{
1583 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001584
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585 unicode = PyUnicode_FromUnicode(s, size);
1586 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1589 Py_DECREF(unicode);
1590 return v;
1591}
1592
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001593PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1594 const char *encoding,
1595 const char *errors)
1596{
1597 PyObject *v;
1598
1599 if (!PyUnicode_Check(unicode)) {
1600 PyErr_BadArgument();
1601 goto onError;
1602 }
1603
1604 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001605 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001606
1607 /* Encode via the codec registry */
1608 v = PyCodec_Encode(unicode, encoding, errors);
1609 if (v == NULL)
1610 goto onError;
1611 return v;
1612
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001614 return NULL;
1615}
1616
Victor Stinnerad158722010-10-27 00:25:46 +00001617PyObject *
1618PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001619{
Victor Stinner313a1202010-06-11 23:56:51 +00001620#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001621 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1622 PyUnicode_GET_SIZE(unicode),
1623 NULL);
1624#elif defined(__APPLE__)
1625 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1626 PyUnicode_GET_SIZE(unicode),
1627 "surrogateescape");
1628#else
1629 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001630 return PyUnicode_AsEncodedString(unicode,
1631 Py_FileSystemDefaultEncoding,
1632 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001633 }
1634 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001635 /* locale encoding with surrogateescape */
1636 wchar_t *wchar;
1637 char *bytes;
1638 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001639 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001640
1641 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1642 if (wchar == NULL)
1643 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001644 bytes = _Py_wchar2char(wchar, &error_pos);
1645 if (bytes == NULL) {
1646 if (error_pos != (size_t)-1) {
1647 char *errmsg = strerror(errno);
1648 PyObject *exc = NULL;
1649 if (errmsg == NULL)
1650 errmsg = "Py_wchar2char() failed";
1651 raise_encode_exception(&exc,
1652 "filesystemencoding",
1653 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1654 error_pos, error_pos+1,
1655 errmsg);
1656 Py_XDECREF(exc);
1657 }
1658 else
1659 PyErr_NoMemory();
1660 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001661 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001662 }
1663 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001664
1665 bytes_obj = PyBytes_FromString(bytes);
1666 PyMem_Free(bytes);
1667 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001668 }
Victor Stinnerad158722010-10-27 00:25:46 +00001669#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001670}
1671
Guido van Rossumd57fd912000-03-10 22:53:23 +00001672PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1673 const char *encoding,
1674 const char *errors)
1675{
1676 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001677 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001678
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 if (!PyUnicode_Check(unicode)) {
1680 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001682 }
Fred Drakee4315f52000-05-09 19:53:39 +00001683
Tim Petersced69f82003-09-16 20:30:58 +00001684 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001685 encoding = PyUnicode_GetDefaultEncoding();
Fred Drakee4315f52000-05-09 19:53:39 +00001686
1687 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001688 if (normalize_encoding(encoding, lower, sizeof(lower))) {
1689 if (strcmp(lower, "utf-8") == 0)
1690 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1691 PyUnicode_GET_SIZE(unicode),
1692 errors);
1693 else if ((strcmp(lower, "latin-1") == 0) ||
1694 (strcmp(lower, "iso-8859-1") == 0))
1695 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1696 PyUnicode_GET_SIZE(unicode),
1697 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001698#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001699 else if (strcmp(lower, "mbcs") == 0)
1700 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1701 PyUnicode_GET_SIZE(unicode),
1702 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001703#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001704 else if (strcmp(lower, "ascii") == 0)
1705 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1706 PyUnicode_GET_SIZE(unicode),
1707 errors);
1708 }
Victor Stinner59e62db2010-05-15 13:14:32 +00001709 /* During bootstrap, we may need to find the encodings
1710 package, to load the file system encoding, and require the
1711 file system encoding in order to load the encodings
1712 package.
Christian Heimes6a27efa2008-10-30 21:48:26 +00001713
Victor Stinner59e62db2010-05-15 13:14:32 +00001714 Break out of this dependency by assuming that the path to
1715 the encodings module is ASCII-only. XXX could try wcstombs
1716 instead, if the file system encoding is the locale's
1717 encoding. */
Victor Stinner37296e82010-06-10 13:36:23 +00001718 if (Py_FileSystemDefaultEncoding &&
Victor Stinner59e62db2010-05-15 13:14:32 +00001719 strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1720 !PyThreadState_GET()->interp->codecs_initialized)
1721 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1722 PyUnicode_GET_SIZE(unicode),
1723 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724
1725 /* Encode via the codec registry */
1726 v = PyCodec_Encode(unicode, encoding, errors);
1727 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001728 return NULL;
1729
1730 /* The normal path */
1731 if (PyBytes_Check(v))
1732 return v;
1733
1734 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001735 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001736 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001737 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001738
1739 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1740 "encoder %s returned bytearray instead of bytes",
1741 encoding);
1742 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001743 Py_DECREF(v);
1744 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001745 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001746
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001747 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1748 Py_DECREF(v);
1749 return b;
1750 }
1751
1752 PyErr_Format(PyExc_TypeError,
1753 "encoder did not return a bytes object (type=%.400s)",
1754 Py_TYPE(v)->tp_name);
1755 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001756 return NULL;
1757}
1758
1759PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1760 const char *encoding,
1761 const char *errors)
1762{
1763 PyObject *v;
1764
1765 if (!PyUnicode_Check(unicode)) {
1766 PyErr_BadArgument();
1767 goto onError;
1768 }
1769
1770 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001771 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001772
1773 /* Encode via the codec registry */
1774 v = PyCodec_Encode(unicode, encoding, errors);
1775 if (v == NULL)
1776 goto onError;
1777 if (!PyUnicode_Check(v)) {
1778 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001779 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001780 Py_TYPE(v)->tp_name);
1781 Py_DECREF(v);
1782 goto onError;
1783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001785
Benjamin Peterson29060642009-01-31 22:14:21 +00001786 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 return NULL;
1788}
1789
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001790PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001792{
1793 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001794 if (v)
1795 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001796 if (errors != NULL)
1797 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001798 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001799 PyUnicode_GET_SIZE(unicode),
1800 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001801 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001802 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001803 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001804 return v;
1805}
1806
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001807PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001808PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001809 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001810 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1811}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001812
Christian Heimes5894ba72007-11-04 11:43:14 +00001813PyObject*
1814PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1815{
Victor Stinnerad158722010-10-27 00:25:46 +00001816#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1817 return PyUnicode_DecodeMBCS(s, size, NULL);
1818#elif defined(__APPLE__)
1819 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1820#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001821 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1822 can be undefined. If it is case, decode using UTF-8. The following assumes
1823 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1824 bootstrapping process where the codecs aren't ready yet.
1825 */
1826 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001827 return PyUnicode_Decode(s, size,
1828 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001829 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001830 }
1831 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001832 /* locale encoding with surrogateescape */
1833 wchar_t *wchar;
1834 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001835 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001836
1837 if (s[size] != '\0' || size != strlen(s)) {
1838 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1839 return NULL;
1840 }
1841
Victor Stinner168e1172010-10-16 23:16:16 +00001842 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001843 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001844 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001845
Victor Stinner168e1172010-10-16 23:16:16 +00001846 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001847 PyMem_Free(wchar);
1848 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001849 }
Victor Stinnerad158722010-10-27 00:25:46 +00001850#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001851}
1852
Martin v. Löwis011e8422009-05-05 04:43:17 +00001853
1854int
1855PyUnicode_FSConverter(PyObject* arg, void* addr)
1856{
1857 PyObject *output = NULL;
1858 Py_ssize_t size;
1859 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001860 if (arg == NULL) {
1861 Py_DECREF(*(PyObject**)addr);
1862 return 1;
1863 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001864 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001865 output = arg;
1866 Py_INCREF(output);
1867 }
1868 else {
1869 arg = PyUnicode_FromObject(arg);
1870 if (!arg)
1871 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001872 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001873 Py_DECREF(arg);
1874 if (!output)
1875 return 0;
1876 if (!PyBytes_Check(output)) {
1877 Py_DECREF(output);
1878 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1879 return 0;
1880 }
1881 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001882 size = PyBytes_GET_SIZE(output);
1883 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001884 if (size != strlen(data)) {
1885 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1886 Py_DECREF(output);
1887 return 0;
1888 }
1889 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001890 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001891}
1892
1893
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001894int
1895PyUnicode_FSDecoder(PyObject* arg, void* addr)
1896{
1897 PyObject *output = NULL;
1898 Py_ssize_t size;
1899 void *data;
1900 if (arg == NULL) {
1901 Py_DECREF(*(PyObject**)addr);
1902 return 1;
1903 }
1904 if (PyUnicode_Check(arg)) {
1905 output = arg;
1906 Py_INCREF(output);
1907 }
1908 else {
1909 arg = PyBytes_FromObject(arg);
1910 if (!arg)
1911 return 0;
1912 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1913 PyBytes_GET_SIZE(arg));
1914 Py_DECREF(arg);
1915 if (!output)
1916 return 0;
1917 if (!PyUnicode_Check(output)) {
1918 Py_DECREF(output);
1919 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1920 return 0;
1921 }
1922 }
1923 size = PyUnicode_GET_SIZE(output);
1924 data = PyUnicode_AS_UNICODE(output);
1925 if (size != Py_UNICODE_strlen(data)) {
1926 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1927 Py_DECREF(output);
1928 return 0;
1929 }
1930 *(PyObject**)addr = output;
1931 return Py_CLEANUP_SUPPORTED;
1932}
1933
1934
Martin v. Löwis5b222132007-06-10 09:51:05 +00001935char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001936_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001937{
Christian Heimesf3863112007-11-22 07:46:41 +00001938 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001939 if (!PyUnicode_Check(unicode)) {
1940 PyErr_BadArgument();
1941 return NULL;
1942 }
Christian Heimesf3863112007-11-22 07:46:41 +00001943 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1944 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001945 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001946 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001947 *psize = PyBytes_GET_SIZE(bytes);
1948 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001949}
1950
1951char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001952_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001953{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001954 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001955}
1956
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1958{
1959 if (!PyUnicode_Check(unicode)) {
1960 PyErr_BadArgument();
1961 goto onError;
1962 }
1963 return PyUnicode_AS_UNICODE(unicode);
1964
Benjamin Peterson29060642009-01-31 22:14:21 +00001965 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 return NULL;
1967}
1968
Martin v. Löwis18e16552006-02-15 17:27:45 +00001969Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970{
1971 if (!PyUnicode_Check(unicode)) {
1972 PyErr_BadArgument();
1973 goto onError;
1974 }
1975 return PyUnicode_GET_SIZE(unicode);
1976
Benjamin Peterson29060642009-01-31 22:14:21 +00001977 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978 return -1;
1979}
1980
/* Name of the default encoding.  It is a fixed string: "utf-8". */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1985
Victor Stinner554f3f02010-06-16 23:33:54 +00001986/* create or adjust a UnicodeDecodeError */
1987static void
1988make_decode_exception(PyObject **exceptionObject,
1989 const char *encoding,
1990 const char *input, Py_ssize_t length,
1991 Py_ssize_t startpos, Py_ssize_t endpos,
1992 const char *reason)
1993{
1994 if (*exceptionObject == NULL) {
1995 *exceptionObject = PyUnicodeDecodeError_Create(
1996 encoding, input, length, startpos, endpos, reason);
1997 }
1998 else {
1999 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2000 goto onError;
2001 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2002 goto onError;
2003 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2004 goto onError;
2005 }
2006 return;
2007
2008onError:
2009 Py_DECREF(*exceptionObject);
2010 *exceptionObject = NULL;
2011}
2012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002013/* error handling callback helper:
2014 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002015 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002016 and adjust various state variables.
2017 return 0 on success, -1 on error
2018*/
2019
2020static
2021int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00002022 const char *encoding, const char *reason,
2023 const char **input, const char **inend, Py_ssize_t *startinpos,
2024 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2025 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002027 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002028
2029 PyObject *restuple = NULL;
2030 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002031 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002032 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002033 Py_ssize_t requiredsize;
2034 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002036 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002037 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002038 int res = -1;
2039
2040 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002041 *errorHandler = PyCodec_LookupError(errors);
2042 if (*errorHandler == NULL)
2043 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 }
2045
Victor Stinner554f3f02010-06-16 23:33:54 +00002046 make_decode_exception(exceptionObject,
2047 encoding,
2048 *input, *inend - *input,
2049 *startinpos, *endinpos,
2050 reason);
2051 if (*exceptionObject == NULL)
2052 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002053
2054 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2055 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002056 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002058 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002059 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 }
2061 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002062 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002063
2064 /* Copy back the bytes variables, which might have been modified by the
2065 callback */
2066 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2067 if (!inputobj)
2068 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002069 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002070 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002071 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002072 *input = PyBytes_AS_STRING(inputobj);
2073 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002074 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002075 /* we can DECREF safely, as the exception has another reference,
2076 so the object won't go away. */
2077 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002080 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002081 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2083 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002084 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085
2086 /* need more space? (at least enough for what we
2087 have+the replacement+the rest of the string (starting
2088 at the new input position), so we won't have to check space
2089 when there are no errors in the rest of the string) */
2090 repptr = PyUnicode_AS_UNICODE(repunicode);
2091 repsize = PyUnicode_GET_SIZE(repunicode);
2092 requiredsize = *outpos + repsize + insize-newpos;
2093 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002094 if (requiredsize<2*outsize)
2095 requiredsize = 2*outsize;
2096 if (_PyUnicode_Resize(output, requiredsize) < 0)
2097 goto onError;
2098 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002099 }
2100 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002101 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002102 Py_UNICODE_COPY(*outptr, repptr, repsize);
2103 *outptr += repsize;
2104 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002106 /* we made it! */
2107 res = 0;
2108
Benjamin Peterson29060642009-01-31 22:14:21 +00002109 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002110 Py_XDECREF(restuple);
2111 return res;
2112}
2113
/* --- UTF-7 Codec -------------------------------------------------------- */

/* The format is described in RFC2152.  Encoding is done conservatively,
   decoding liberally. */

/* Three small macros implementing the base-64 alphabet.  Each of them
   may evaluate its argument more than once. */

/* Does c belong to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/')? */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* The 6-bit value of c, which must already be known to be a base-64
   character: A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62 and
   everything else (i.e. '/') -> 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* The base-64 character for the lowest 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: should this byte of a UTF-7 string be decoded as
 * itself?  Decoding is permissive: every ASCII byte stands for itself
 * except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) < 128)
2148
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; the table is only consulted when 0 < c < 128.
 * directO  -- non-zero: Set O characters are encoded as themselves.
 * directWS -- non-zero: whitespace characters are encoded as themselves.
 *
 * Every parameter is parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) can be passed safely.  `c` may be
 * evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002194
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002195PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002196 Py_ssize_t size,
2197 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002198{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002199 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2200}
2201
Antoine Pitrou244651a2009-05-04 18:56:13 +00002202/* The decoder. The only state we preserve is our read position,
2203 * i.e. how many characters we have consumed. So if we end in the
2204 * middle of a shift sequence we have to back off the read position
2205 * and the output to the beginning of the sequence, otherwise we lose
2206 * all the shift state (seen bits, number of bits seen, high
2207 * surrogate). */
2208
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002209PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002210 Py_ssize_t size,
2211 const char *errors,
2212 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002213{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002214 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002215 Py_ssize_t startinpos;
2216 Py_ssize_t endinpos;
2217 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002218 const char *e;
2219 PyUnicodeObject *unicode;
2220 Py_UNICODE *p;
2221 const char *errmsg = "";
2222 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002223 Py_UNICODE *shiftOutStart;
2224 unsigned int base64bits = 0;
2225 unsigned long base64buffer = 0;
2226 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002227 PyObject *errorHandler = NULL;
2228 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002229
2230 unicode = _PyUnicode_New(size);
2231 if (!unicode)
2232 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002233 if (size == 0) {
2234 if (consumed)
2235 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002236 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002237 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002238
2239 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002240 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002241 e = s + size;
2242
2243 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002245 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002246 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002247
Antoine Pitrou244651a2009-05-04 18:56:13 +00002248 if (inShift) { /* in a base-64 section */
2249 if (IS_BASE64(ch)) { /* consume a base-64 character */
2250 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2251 base64bits += 6;
2252 s++;
2253 if (base64bits >= 16) {
2254 /* we have enough bits for a UTF-16 value */
2255 Py_UNICODE outCh = (Py_UNICODE)
2256 (base64buffer >> (base64bits-16));
2257 base64bits -= 16;
2258 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2259 if (surrogate) {
2260 /* expecting a second surrogate */
2261 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2262#ifdef Py_UNICODE_WIDE
2263 *p++ = (((surrogate & 0x3FF)<<10)
2264 | (outCh & 0x3FF)) + 0x10000;
2265#else
2266 *p++ = surrogate;
2267 *p++ = outCh;
2268#endif
2269 surrogate = 0;
2270 }
2271 else {
2272 surrogate = 0;
2273 errmsg = "second surrogate missing";
2274 goto utf7Error;
2275 }
2276 }
2277 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2278 /* first surrogate */
2279 surrogate = outCh;
2280 }
2281 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2282 errmsg = "unexpected second surrogate";
2283 goto utf7Error;
2284 }
2285 else {
2286 *p++ = outCh;
2287 }
2288 }
2289 }
2290 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002291 inShift = 0;
2292 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293 if (surrogate) {
2294 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002295 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002296 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002297 if (base64bits > 0) { /* left-over bits */
2298 if (base64bits >= 6) {
2299 /* We've seen at least one base-64 character */
2300 errmsg = "partial character in shift sequence";
2301 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002302 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002303 else {
2304 /* Some bits remain; they should be zero */
2305 if (base64buffer != 0) {
2306 errmsg = "non-zero padding bits in shift sequence";
2307 goto utf7Error;
2308 }
2309 }
2310 }
2311 if (ch != '-') {
2312 /* '-' is absorbed; other terminating
2313 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002314 *p++ = ch;
2315 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002316 }
2317 }
2318 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002319 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002320 s++; /* consume '+' */
2321 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002322 s++;
2323 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002324 }
2325 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002326 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002327 shiftOutStart = p;
2328 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002329 }
2330 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002331 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002332 *p++ = ch;
2333 s++;
2334 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002335 else {
2336 startinpos = s-starts;
2337 s++;
2338 errmsg = "unexpected special character";
2339 goto utf7Error;
2340 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002341 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002342utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002343 outpos = p-PyUnicode_AS_UNICODE(unicode);
2344 endinpos = s-starts;
2345 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002346 errors, &errorHandler,
2347 "utf7", errmsg,
2348 &starts, &e, &startinpos, &endinpos, &exc, &s,
2349 &unicode, &outpos, &p))
2350 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002351 }
2352
Antoine Pitrou244651a2009-05-04 18:56:13 +00002353 /* end of string */
2354
2355 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2356 /* if we're in an inconsistent state, that's an error */
2357 if (surrogate ||
2358 (base64bits >= 6) ||
2359 (base64bits > 0 && base64buffer != 0)) {
2360 outpos = p-PyUnicode_AS_UNICODE(unicode);
2361 endinpos = size;
2362 if (unicode_decode_call_errorhandler(
2363 errors, &errorHandler,
2364 "utf7", "unterminated shift sequence",
2365 &starts, &e, &startinpos, &endinpos, &exc, &s,
2366 &unicode, &outpos, &p))
2367 goto onError;
2368 if (s < e)
2369 goto restart;
2370 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002371 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002372
2373 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002374 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002375 if (inShift) {
2376 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002377 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002378 }
2379 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002380 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002381 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002382 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002383
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002384 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002385 goto onError;
2386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002387 Py_XDECREF(errorHandler);
2388 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002389 return (PyObject *)unicode;
2390
Benjamin Peterson29060642009-01-31 22:14:21 +00002391 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002392 Py_XDECREF(errorHandler);
2393 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002394 Py_DECREF(unicode);
2395 return NULL;
2396}
2397
2398
2399PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002400 Py_ssize_t size,
Antoine Pitrou244651a2009-05-04 18:56:13 +00002401 int base64SetO,
2402 int base64WhiteSpace,
Benjamin Peterson29060642009-01-31 22:14:21 +00002403 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002404{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002405 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002406 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002407 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002408 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002409 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002410 unsigned int base64bits = 0;
2411 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002412 char * out;
2413 char * start;
2414
2415 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002416 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002417
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002418 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002419 return PyErr_NoMemory();
2420
Antoine Pitrou244651a2009-05-04 18:56:13 +00002421 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002422 if (v == NULL)
2423 return NULL;
2424
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002425 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002426 for (;i < size; ++i) {
2427 Py_UNICODE ch = s[i];
2428
Antoine Pitrou244651a2009-05-04 18:56:13 +00002429 if (inShift) {
2430 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2431 /* shifting out */
2432 if (base64bits) { /* output remaining bits */
2433 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2434 base64buffer = 0;
2435 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002436 }
2437 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002438 /* Characters not in the BASE64 set implicitly unshift the sequence
2439 so no '-' is required, except if the character is itself a '-' */
2440 if (IS_BASE64(ch) || ch == '-') {
2441 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002442 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002443 *out++ = (char) ch;
2444 }
2445 else {
2446 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002447 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002448 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002449 else { /* not in a shift sequence */
2450 if (ch == '+') {
2451 *out++ = '+';
2452 *out++ = '-';
2453 }
2454 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2455 *out++ = (char) ch;
2456 }
2457 else {
2458 *out++ = '+';
2459 inShift = 1;
2460 goto encode_char;
2461 }
2462 }
2463 continue;
2464encode_char:
2465#ifdef Py_UNICODE_WIDE
2466 if (ch >= 0x10000) {
2467 /* code first surrogate */
2468 base64bits += 16;
2469 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2470 while (base64bits >= 6) {
2471 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2472 base64bits -= 6;
2473 }
2474 /* prepare second surrogate */
2475 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2476 }
2477#endif
2478 base64bits += 16;
2479 base64buffer = (base64buffer << 16) | ch;
2480 while (base64bits >= 6) {
2481 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2482 base64bits -= 6;
2483 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002484 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002485 if (base64bits)
2486 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2487 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002488 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002489 if (_PyBytes_Resize(&v, out - start) < 0)
2490 return NULL;
2491 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002492}
2493
Antoine Pitrou244651a2009-05-04 18:56:13 +00002494#undef IS_BASE64
2495#undef FROM_BASE64
2496#undef TO_BASE64
2497#undef DECODE_DIRECT
2498#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002499
Guido van Rossumd57fd912000-03-10 22:53:23 +00002500/* --- UTF-8 Codec -------------------------------------------------------- */
2501
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 are illegal, C2-CF start 2-byte sequences */
    /* C0-CF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 start 4-byte sequences, F5-FF are illegal */
    /* F0-FF */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
2523
Guido van Rossumd57fd912000-03-10 22:53:23 +00002524PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002525 Py_ssize_t size,
2526 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527{
Walter Dörwald69652032004-09-07 20:24:22 +00002528 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2529}
2530
Antoine Pitrouab868312009-01-10 15:40:25 +00002531/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2532#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2533
2534/* Mask to quickly check whether a C 'long' contains a
2535 non-ASCII, UTF8-encoded char. */
2536#if (SIZEOF_LONG == 8)
2537# define ASCII_CHAR_MASK 0x8080808080808080L
2538#elif (SIZEOF_LONG == 4)
2539# define ASCII_CHAR_MASK 0x80808080L
2540#else
2541# error C 'long' size should be either 4 or 8!
2542#endif
2543
Walter Dörwald69652032004-09-07 20:24:22 +00002544PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002545 Py_ssize_t size,
2546 const char *errors,
2547 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002548{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002549 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002551 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002552 Py_ssize_t startinpos;
2553 Py_ssize_t endinpos;
2554 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002555 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 PyUnicodeObject *unicode;
2557 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002558 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002559 PyObject *errorHandler = NULL;
2560 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561
2562 /* Note: size will always be longer than the resulting Unicode
2563 character count */
2564 unicode = _PyUnicode_New(size);
2565 if (!unicode)
2566 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002567 if (size == 0) {
2568 if (consumed)
2569 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002570 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002572
2573 /* Unpack UTF-8 encoded data */
2574 p = unicode->str;
2575 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002576 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577
2578 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002579 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580
2581 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002582 /* Fast path for runs of ASCII characters. Given that common UTF-8
2583 input will consist of an overwhelming majority of ASCII
2584 characters, we try to optimize for this case by checking
2585 as many characters as a C 'long' can contain.
2586 First, check if we can do an aligned read, as most CPUs have
2587 a penalty for unaligned reads.
2588 */
2589 if (!((size_t) s & LONG_PTR_MASK)) {
2590 /* Help register allocation */
2591 register const char *_s = s;
2592 register Py_UNICODE *_p = p;
2593 while (_s < aligned_end) {
2594 /* Read a whole long at a time (either 4 or 8 bytes),
2595 and do a fast unrolled copy if it only contains ASCII
2596 characters. */
2597 unsigned long data = *(unsigned long *) _s;
2598 if (data & ASCII_CHAR_MASK)
2599 break;
2600 _p[0] = (unsigned char) _s[0];
2601 _p[1] = (unsigned char) _s[1];
2602 _p[2] = (unsigned char) _s[2];
2603 _p[3] = (unsigned char) _s[3];
2604#if (SIZEOF_LONG == 8)
2605 _p[4] = (unsigned char) _s[4];
2606 _p[5] = (unsigned char) _s[5];
2607 _p[6] = (unsigned char) _s[6];
2608 _p[7] = (unsigned char) _s[7];
2609#endif
2610 _s += SIZEOF_LONG;
2611 _p += SIZEOF_LONG;
2612 }
2613 s = _s;
2614 p = _p;
2615 if (s == e)
2616 break;
2617 ch = (unsigned char)*s;
2618 }
2619 }
2620
2621 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002622 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623 s++;
2624 continue;
2625 }
2626
2627 n = utf8_code_length[ch];
2628
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002629 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002630 if (consumed)
2631 break;
2632 else {
2633 errmsg = "unexpected end of data";
2634 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002635 endinpos = startinpos+1;
2636 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2637 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002638 goto utf8Error;
2639 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641
2642 switch (n) {
2643
2644 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002645 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002646 startinpos = s-starts;
2647 endinpos = startinpos+1;
2648 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649
2650 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002651 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002652 startinpos = s-starts;
2653 endinpos = startinpos+1;
2654 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655
2656 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002657 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002658 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002659 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002660 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002661 goto utf8Error;
2662 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002664 assert ((ch > 0x007F) && (ch <= 0x07FF));
2665 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 break;
2667
2668 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002669 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2670 will result in surrogates in range d800-dfff. Surrogates are
2671 not valid UTF-8 so they are rejected.
2672 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2673 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002674 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002675 (s[2] & 0xc0) != 0x80 ||
2676 ((unsigned char)s[0] == 0xE0 &&
2677 (unsigned char)s[1] < 0xA0) ||
2678 ((unsigned char)s[0] == 0xED &&
2679 (unsigned char)s[1] > 0x9F)) {
2680 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002681 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002682 endinpos = startinpos + 1;
2683
2684 /* if s[1] first two bits are 1 and 0, then the invalid
2685 continuation byte is s[2], so increment endinpos by 1,
2686 if not, s[1] is invalid and endinpos doesn't need to
2687 be incremented. */
2688 if ((s[1] & 0xC0) == 0x80)
2689 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002690 goto utf8Error;
2691 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002693 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2694 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002695 break;
2696
2697 case 4:
2698 if ((s[1] & 0xc0) != 0x80 ||
2699 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002700 (s[3] & 0xc0) != 0x80 ||
2701 ((unsigned char)s[0] == 0xF0 &&
2702 (unsigned char)s[1] < 0x90) ||
2703 ((unsigned char)s[0] == 0xF4 &&
2704 (unsigned char)s[1] > 0x8F)) {
2705 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002706 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002707 endinpos = startinpos + 1;
2708 if ((s[1] & 0xC0) == 0x80) {
2709 endinpos++;
2710 if ((s[2] & 0xC0) == 0x80)
2711 endinpos++;
2712 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002713 goto utf8Error;
2714 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002715 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002716 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2717 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2718
Fredrik Lundh8f455852001-06-27 18:59:43 +00002719#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002720 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002721#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002722 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002723
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002724 /* translate from 10000..10FFFF to 0..FFFF */
2725 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002726
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002727 /* high surrogate = top 10 bits added to D800 */
2728 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002729
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002730 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002731 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002732#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734 }
2735 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002736 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002737
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 utf8Error:
2739 outpos = p-PyUnicode_AS_UNICODE(unicode);
2740 if (unicode_decode_call_errorhandler(
2741 errors, &errorHandler,
2742 "utf8", errmsg,
2743 &starts, &e, &startinpos, &endinpos, &exc, &s,
2744 &unicode, &outpos, &p))
2745 goto onError;
2746 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 }
Walter Dörwald69652032004-09-07 20:24:22 +00002748 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750
2751 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002752 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 goto onError;
2754
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002755 Py_XDECREF(errorHandler);
2756 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757 return (PyObject *)unicode;
2758
Benjamin Peterson29060642009-01-31 22:14:21 +00002759 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002760 Py_XDECREF(errorHandler);
2761 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 Py_DECREF(unicode);
2763 return NULL;
2764}
2765
Antoine Pitrouab868312009-01-10 15:40:25 +00002766#undef ASCII_CHAR_MASK
2767
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002768#ifdef __APPLE__
2769
2770/* Simplified UTF-8 decoder using surrogateescape error handler,
2771 used to decode the command line arguments on Mac OS X. */
2772
2773wchar_t*
2774_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2775{
2776 int n;
2777 const char *e;
2778 wchar_t *unicode, *p;
2779
2780 /* Note: size will always be longer than the resulting Unicode
2781 character count */
2782 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2783 PyErr_NoMemory();
2784 return NULL;
2785 }
2786 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2787 if (!unicode)
2788 return NULL;
2789
2790 /* Unpack UTF-8 encoded data */
2791 p = unicode;
2792 e = s + size;
2793 while (s < e) {
2794 Py_UCS4 ch = (unsigned char)*s;
2795
2796 if (ch < 0x80) {
2797 *p++ = (wchar_t)ch;
2798 s++;
2799 continue;
2800 }
2801
2802 n = utf8_code_length[ch];
2803 if (s + n > e) {
2804 goto surrogateescape;
2805 }
2806
2807 switch (n) {
2808 case 0:
2809 case 1:
2810 goto surrogateescape;
2811
2812 case 2:
2813 if ((s[1] & 0xc0) != 0x80)
2814 goto surrogateescape;
2815 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2816 assert ((ch > 0x007F) && (ch <= 0x07FF));
2817 *p++ = (wchar_t)ch;
2818 break;
2819
2820 case 3:
2821 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2822 will result in surrogates in range d800-dfff. Surrogates are
2823 not valid UTF-8 so they are rejected.
2824 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2825 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2826 if ((s[1] & 0xc0) != 0x80 ||
2827 (s[2] & 0xc0) != 0x80 ||
2828 ((unsigned char)s[0] == 0xE0 &&
2829 (unsigned char)s[1] < 0xA0) ||
2830 ((unsigned char)s[0] == 0xED &&
2831 (unsigned char)s[1] > 0x9F)) {
2832
2833 goto surrogateescape;
2834 }
2835 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2836 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2837 *p++ = (Py_UNICODE)ch;
2838 break;
2839
2840 case 4:
2841 if ((s[1] & 0xc0) != 0x80 ||
2842 (s[2] & 0xc0) != 0x80 ||
2843 (s[3] & 0xc0) != 0x80 ||
2844 ((unsigned char)s[0] == 0xF0 &&
2845 (unsigned char)s[1] < 0x90) ||
2846 ((unsigned char)s[0] == 0xF4 &&
2847 (unsigned char)s[1] > 0x8F)) {
2848 goto surrogateescape;
2849 }
2850 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2851 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2852 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2853
2854#if SIZEOF_WCHAR_T == 4
2855 *p++ = (wchar_t)ch;
2856#else
2857 /* compute and append the two surrogates: */
2858
2859 /* translate from 10000..10FFFF to 0..FFFF */
2860 ch -= 0x10000;
2861
2862 /* high surrogate = top 10 bits added to D800 */
2863 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2864
2865 /* low surrogate = bottom 10 bits added to DC00 */
2866 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2867#endif
2868 break;
2869 }
2870 s += n;
2871 continue;
2872
2873 surrogateescape:
2874 *p++ = 0xDC00 + ch;
2875 s++;
2876 }
2877 *p = L'\0';
2878 return unicode;
2879}
2880
2881#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002882
Tim Peters602f7402002-04-27 18:03:26 +00002883/* Allocation strategy: if the string is short, convert into a stack buffer
2884 and allocate exactly as much space needed at the end. Else allocate the
2885 maximum possible needed (4 result bytes per Unicode character), and return
2886 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002887*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002888PyObject *
2889PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002890 Py_ssize_t size,
2891 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892{
Tim Peters602f7402002-04-27 18:03:26 +00002893#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002894
Guido van Rossum98297ee2007-11-06 21:34:58 +00002895 Py_ssize_t i; /* index into s of next input byte */
2896 PyObject *result; /* result string object */
2897 char *p; /* next free byte in output buffer */
2898 Py_ssize_t nallocated; /* number of result bytes allocated */
2899 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002900 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002901 PyObject *errorHandler = NULL;
2902 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002903
Tim Peters602f7402002-04-27 18:03:26 +00002904 assert(s != NULL);
2905 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906
Tim Peters602f7402002-04-27 18:03:26 +00002907 if (size <= MAX_SHORT_UNICHARS) {
2908 /* Write into the stack buffer; nallocated can't overflow.
2909 * At the end, we'll allocate exactly as much heap space as it
2910 * turns out we need.
2911 */
2912 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002913 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002914 p = stackbuf;
2915 }
2916 else {
2917 /* Overallocate on the heap, and give the excess back at the end. */
2918 nallocated = size * 4;
2919 if (nallocated / 4 != size) /* overflow! */
2920 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002921 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002922 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002923 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002924 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002925 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002926
Tim Peters602f7402002-04-27 18:03:26 +00002927 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002928 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002929
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002930 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002931 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002933
Guido van Rossumd57fd912000-03-10 22:53:23 +00002934 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002935 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002936 *p++ = (char)(0xc0 | (ch >> 6));
2937 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002938 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002939#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002940 /* Special case: check for high and low surrogate */
2941 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2942 Py_UCS4 ch2 = s[i];
2943 /* Combine the two surrogates to form a UCS4 value */
2944 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2945 i++;
2946
2947 /* Encode UCS4 Unicode ordinals */
2948 *p++ = (char)(0xf0 | (ch >> 18));
2949 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002950 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2951 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002952 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002953#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002954 Py_ssize_t newpos;
2955 PyObject *rep;
2956 Py_ssize_t repsize, k;
2957 rep = unicode_encode_call_errorhandler
2958 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2959 s, size, &exc, i-1, i, &newpos);
2960 if (!rep)
2961 goto error;
2962
2963 if (PyBytes_Check(rep))
2964 repsize = PyBytes_GET_SIZE(rep);
2965 else
2966 repsize = PyUnicode_GET_SIZE(rep);
2967
2968 if (repsize > 4) {
2969 Py_ssize_t offset;
2970
2971 if (result == NULL)
2972 offset = p - stackbuf;
2973 else
2974 offset = p - PyBytes_AS_STRING(result);
2975
2976 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2977 /* integer overflow */
2978 PyErr_NoMemory();
2979 goto error;
2980 }
2981 nallocated += repsize - 4;
2982 if (result != NULL) {
2983 if (_PyBytes_Resize(&result, nallocated) < 0)
2984 goto error;
2985 } else {
2986 result = PyBytes_FromStringAndSize(NULL, nallocated);
2987 if (result == NULL)
2988 goto error;
2989 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2990 }
2991 p = PyBytes_AS_STRING(result) + offset;
2992 }
2993
2994 if (PyBytes_Check(rep)) {
2995 char *prep = PyBytes_AS_STRING(rep);
2996 for(k = repsize; k > 0; k--)
2997 *p++ = *prep++;
2998 } else /* rep is unicode */ {
2999 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3000 Py_UNICODE c;
3001
3002 for(k=0; k<repsize; k++) {
3003 c = prep[k];
3004 if (0x80 <= c) {
3005 raise_encode_exception(&exc, "utf-8", s, size,
3006 i-1, i, "surrogates not allowed");
3007 goto error;
3008 }
3009 *p++ = (char)prep[k];
3010 }
3011 }
3012 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003013#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003014 }
Victor Stinner445a6232010-04-22 20:01:57 +00003015#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003016 } else if (ch < 0x10000) {
3017 *p++ = (char)(0xe0 | (ch >> 12));
3018 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3019 *p++ = (char)(0x80 | (ch & 0x3f));
3020 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003021 /* Encode UCS4 Unicode ordinals */
3022 *p++ = (char)(0xf0 | (ch >> 18));
3023 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3024 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3025 *p++ = (char)(0x80 | (ch & 0x3f));
3026 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003027 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003028
Guido van Rossum98297ee2007-11-06 21:34:58 +00003029 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003030 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003031 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003032 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003033 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003034 }
3035 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003036 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003037 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003038 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003039 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003040 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003041 Py_XDECREF(errorHandler);
3042 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003043 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003044 error:
3045 Py_XDECREF(errorHandler);
3046 Py_XDECREF(exc);
3047 Py_XDECREF(result);
3048 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003049
Tim Peters602f7402002-04-27 18:03:26 +00003050#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051}
3052
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003055 if (!PyUnicode_Check(unicode)) {
3056 PyErr_BadArgument();
3057 return NULL;
3058 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003059 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 PyUnicode_GET_SIZE(unicode),
3061 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062}
3063
Walter Dörwald41980ca2007-08-16 21:55:45 +00003064/* --- UTF-32 Codec ------------------------------------------------------- */
3065
3066PyObject *
3067PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003068 Py_ssize_t size,
3069 const char *errors,
3070 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003071{
3072 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3073}
3074
3075PyObject *
3076PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 Py_ssize_t size,
3078 const char *errors,
3079 int *byteorder,
3080 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003081{
3082 const char *starts = s;
3083 Py_ssize_t startinpos;
3084 Py_ssize_t endinpos;
3085 Py_ssize_t outpos;
3086 PyUnicodeObject *unicode;
3087 Py_UNICODE *p;
3088#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003089 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003090 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003091#else
3092 const int pairs = 0;
3093#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003094 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003095 int bo = 0; /* assume native ordering by default */
3096 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003097 /* Offsets from q for retrieving bytes in the right order. */
3098#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3099 int iorder[] = {0, 1, 2, 3};
3100#else
3101 int iorder[] = {3, 2, 1, 0};
3102#endif
3103 PyObject *errorHandler = NULL;
3104 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003105
Walter Dörwald41980ca2007-08-16 21:55:45 +00003106 q = (unsigned char *)s;
3107 e = q + size;
3108
3109 if (byteorder)
3110 bo = *byteorder;
3111
3112 /* Check for BOM marks (U+FEFF) in the input and adjust current
3113 byte order setting accordingly. In native mode, the leading BOM
3114 mark is skipped, in all other modes, it is copied to the output
3115 stream as-is (giving a ZWNBSP character). */
3116 if (bo == 0) {
3117 if (size >= 4) {
3118 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003119 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003120#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003121 if (bom == 0x0000FEFF) {
3122 q += 4;
3123 bo = -1;
3124 }
3125 else if (bom == 0xFFFE0000) {
3126 q += 4;
3127 bo = 1;
3128 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003129#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003130 if (bom == 0x0000FEFF) {
3131 q += 4;
3132 bo = 1;
3133 }
3134 else if (bom == 0xFFFE0000) {
3135 q += 4;
3136 bo = -1;
3137 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003138#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003139 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003140 }
3141
3142 if (bo == -1) {
3143 /* force LE */
3144 iorder[0] = 0;
3145 iorder[1] = 1;
3146 iorder[2] = 2;
3147 iorder[3] = 3;
3148 }
3149 else if (bo == 1) {
3150 /* force BE */
3151 iorder[0] = 3;
3152 iorder[1] = 2;
3153 iorder[2] = 1;
3154 iorder[3] = 0;
3155 }
3156
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003157 /* On narrow builds we split characters outside the BMP into two
3158 codepoints => count how much extra space we need. */
3159#ifndef Py_UNICODE_WIDE
3160 for (qq = q; qq < e; qq += 4)
3161 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3162 pairs++;
3163#endif
3164
3165 /* This might be one to much, because of a BOM */
3166 unicode = _PyUnicode_New((size+3)/4+pairs);
3167 if (!unicode)
3168 return NULL;
3169 if (size == 0)
3170 return (PyObject *)unicode;
3171
3172 /* Unpack UTF-32 encoded data */
3173 p = unicode->str;
3174
Walter Dörwald41980ca2007-08-16 21:55:45 +00003175 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 Py_UCS4 ch;
3177 /* remaining bytes at the end? (size should be divisible by 4) */
3178 if (e-q<4) {
3179 if (consumed)
3180 break;
3181 errmsg = "truncated data";
3182 startinpos = ((const char *)q)-starts;
3183 endinpos = ((const char *)e)-starts;
3184 goto utf32Error;
3185 /* The remaining input chars are ignored if the callback
3186 chooses to skip the input */
3187 }
3188 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3189 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003190
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 if (ch >= 0x110000)
3192 {
3193 errmsg = "codepoint not in range(0x110000)";
3194 startinpos = ((const char *)q)-starts;
3195 endinpos = startinpos+4;
3196 goto utf32Error;
3197 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003198#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003199 if (ch >= 0x10000)
3200 {
3201 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3202 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3203 }
3204 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003205#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 *p++ = ch;
3207 q += 4;
3208 continue;
3209 utf32Error:
3210 outpos = p-PyUnicode_AS_UNICODE(unicode);
3211 if (unicode_decode_call_errorhandler(
3212 errors, &errorHandler,
3213 "utf32", errmsg,
3214 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3215 &unicode, &outpos, &p))
3216 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003217 }
3218
3219 if (byteorder)
3220 *byteorder = bo;
3221
3222 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003224
3225 /* Adjust length */
3226 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3227 goto onError;
3228
3229 Py_XDECREF(errorHandler);
3230 Py_XDECREF(exc);
3231 return (PyObject *)unicode;
3232
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003234 Py_DECREF(unicode);
3235 Py_XDECREF(errorHandler);
3236 Py_XDECREF(exc);
3237 return NULL;
3238}
3239
3240PyObject *
3241PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 Py_ssize_t size,
3243 const char *errors,
3244 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003245{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003246 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003247 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003248 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003249#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003250 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003251#else
3252 const int pairs = 0;
3253#endif
3254 /* Offsets from p for storing byte pairs in the right order. */
3255#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3256 int iorder[] = {0, 1, 2, 3};
3257#else
3258 int iorder[] = {3, 2, 1, 0};
3259#endif
3260
Benjamin Peterson29060642009-01-31 22:14:21 +00003261#define STORECHAR(CH) \
3262 do { \
3263 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3264 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3265 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3266 p[iorder[0]] = (CH) & 0xff; \
3267 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003268 } while(0)
3269
3270 /* In narrow builds we can output surrogate pairs as one codepoint,
3271 so we need less space. */
3272#ifndef Py_UNICODE_WIDE
3273 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3275 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3276 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003277#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003278 nsize = (size - pairs + (byteorder == 0));
3279 bytesize = nsize * 4;
3280 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003282 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003283 if (v == NULL)
3284 return NULL;
3285
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003286 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003287 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003288 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003289 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003290 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003291
3292 if (byteorder == -1) {
3293 /* force LE */
3294 iorder[0] = 0;
3295 iorder[1] = 1;
3296 iorder[2] = 2;
3297 iorder[3] = 3;
3298 }
3299 else if (byteorder == 1) {
3300 /* force BE */
3301 iorder[0] = 3;
3302 iorder[1] = 2;
3303 iorder[2] = 1;
3304 iorder[3] = 0;
3305 }
3306
3307 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003308 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003309#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003310 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3311 Py_UCS4 ch2 = *s;
3312 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3313 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3314 s++;
3315 size--;
3316 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003317 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003318#endif
3319 STORECHAR(ch);
3320 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003321
3322 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003323 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003324#undef STORECHAR
3325}
3326
3327PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3328{
3329 if (!PyUnicode_Check(unicode)) {
3330 PyErr_BadArgument();
3331 return NULL;
3332 }
3333 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003334 PyUnicode_GET_SIZE(unicode),
3335 NULL,
3336 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003337}
3338
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339/* --- UTF-16 Codec ------------------------------------------------------- */
3340
Tim Peters772747b2001-08-09 22:21:55 +00003341PyObject *
3342PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003343 Py_ssize_t size,
3344 const char *errors,
3345 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346{
Walter Dörwald69652032004-09-07 20:24:22 +00003347 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3348}
3349
Antoine Pitrouab868312009-01-10 15:40:25 +00003350/* Two masks for fast checking of whether a C 'long' may contain
3351 UTF16-encoded surrogate characters. This is an efficient heuristic,
3352 assuming that non-surrogate characters with a code point >= 0x8000 are
3353 rare in most input.
3354 FAST_CHAR_MASK is used when the input is in native byte ordering,
3355 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003356*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003357#if (SIZEOF_LONG == 8)
3358# define FAST_CHAR_MASK 0x8000800080008000L
3359# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3360#elif (SIZEOF_LONG == 4)
3361# define FAST_CHAR_MASK 0x80008000L
3362# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3363#else
3364# error C 'long' size should be either 4 or 8!
3365#endif
3366
Walter Dörwald69652032004-09-07 20:24:22 +00003367PyObject *
3368PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003369 Py_ssize_t size,
3370 const char *errors,
3371 int *byteorder,
3372 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003373{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003374 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003375 Py_ssize_t startinpos;
3376 Py_ssize_t endinpos;
3377 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 PyUnicodeObject *unicode;
3379 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003380 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003381 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003382 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003383 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003384 /* Offsets from q for retrieving byte pairs in the right order. */
3385#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3386 int ihi = 1, ilo = 0;
3387#else
3388 int ihi = 0, ilo = 1;
3389#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 PyObject *errorHandler = NULL;
3391 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392
3393 /* Note: size will always be longer than the resulting Unicode
3394 character count */
3395 unicode = _PyUnicode_New(size);
3396 if (!unicode)
3397 return NULL;
3398 if (size == 0)
3399 return (PyObject *)unicode;
3400
3401 /* Unpack UTF-16 encoded data */
3402 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003403 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003404 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405
3406 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003407 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003409 /* Check for BOM marks (U+FEFF) in the input and adjust current
3410 byte order setting accordingly. In native mode, the leading BOM
3411 mark is skipped, in all other modes, it is copied to the output
3412 stream as-is (giving a ZWNBSP character). */
3413 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003414 if (size >= 2) {
3415 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003416#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003417 if (bom == 0xFEFF) {
3418 q += 2;
3419 bo = -1;
3420 }
3421 else if (bom == 0xFFFE) {
3422 q += 2;
3423 bo = 1;
3424 }
Tim Petersced69f82003-09-16 20:30:58 +00003425#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003426 if (bom == 0xFEFF) {
3427 q += 2;
3428 bo = 1;
3429 }
3430 else if (bom == 0xFFFE) {
3431 q += 2;
3432 bo = -1;
3433 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003434#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003436 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437
Tim Peters772747b2001-08-09 22:21:55 +00003438 if (bo == -1) {
3439 /* force LE */
3440 ihi = 1;
3441 ilo = 0;
3442 }
3443 else if (bo == 1) {
3444 /* force BE */
3445 ihi = 0;
3446 ilo = 1;
3447 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003448#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3449 native_ordering = ilo < ihi;
3450#else
3451 native_ordering = ilo > ihi;
3452#endif
Tim Peters772747b2001-08-09 22:21:55 +00003453
Antoine Pitrouab868312009-01-10 15:40:25 +00003454 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003455 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003456 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003457 /* First check for possible aligned read of a C 'long'. Unaligned
3458 reads are more expensive, better to defer to another iteration. */
3459 if (!((size_t) q & LONG_PTR_MASK)) {
3460 /* Fast path for runs of non-surrogate chars. */
3461 register const unsigned char *_q = q;
3462 Py_UNICODE *_p = p;
3463 if (native_ordering) {
3464 /* Native ordering is simple: as long as the input cannot
3465 possibly contain a surrogate char, do an unrolled copy
3466 of several 16-bit code points to the target object.
3467 The non-surrogate check is done on several input bytes
3468 at a time (as many as a C 'long' can contain). */
3469 while (_q < aligned_end) {
3470 unsigned long data = * (unsigned long *) _q;
3471 if (data & FAST_CHAR_MASK)
3472 break;
3473 _p[0] = ((unsigned short *) _q)[0];
3474 _p[1] = ((unsigned short *) _q)[1];
3475#if (SIZEOF_LONG == 8)
3476 _p[2] = ((unsigned short *) _q)[2];
3477 _p[3] = ((unsigned short *) _q)[3];
3478#endif
3479 _q += SIZEOF_LONG;
3480 _p += SIZEOF_LONG / 2;
3481 }
3482 }
3483 else {
3484 /* Byteswapped ordering is similar, but we must decompose
3485 the copy bytewise, and take care of zero'ing out the
3486 upper bytes if the target object is in 32-bit units
3487 (that is, in UCS-4 builds). */
3488 while (_q < aligned_end) {
3489 unsigned long data = * (unsigned long *) _q;
3490 if (data & SWAPPED_FAST_CHAR_MASK)
3491 break;
3492 /* Zero upper bytes in UCS-4 builds */
3493#if (Py_UNICODE_SIZE > 2)
3494 _p[0] = 0;
3495 _p[1] = 0;
3496#if (SIZEOF_LONG == 8)
3497 _p[2] = 0;
3498 _p[3] = 0;
3499#endif
3500#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003501 /* Issue #4916; UCS-4 builds on big endian machines must
3502 fill the two last bytes of each 4-byte unit. */
3503#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3504# define OFF 2
3505#else
3506# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003507#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003508 ((unsigned char *) _p)[OFF + 1] = _q[0];
3509 ((unsigned char *) _p)[OFF + 0] = _q[1];
3510 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3511 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3512#if (SIZEOF_LONG == 8)
3513 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3514 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3515 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3516 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3517#endif
3518#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003519 _q += SIZEOF_LONG;
3520 _p += SIZEOF_LONG / 2;
3521 }
3522 }
3523 p = _p;
3524 q = _q;
3525 if (q >= e)
3526 break;
3527 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003528 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003529
Benjamin Peterson14339b62009-01-31 16:36:08 +00003530 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003531
3532 if (ch < 0xD800 || ch > 0xDFFF) {
3533 *p++ = ch;
3534 continue;
3535 }
3536
3537 /* UTF-16 code pair: */
3538 if (q > e) {
3539 errmsg = "unexpected end of data";
3540 startinpos = (((const char *)q) - 2) - starts;
3541 endinpos = ((const char *)e) + 1 - starts;
3542 goto utf16Error;
3543 }
3544 if (0xD800 <= ch && ch <= 0xDBFF) {
3545 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3546 q += 2;
3547 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003548#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003549 *p++ = ch;
3550 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003551#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003552 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003553#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 continue;
3555 }
3556 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003557 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003558 startinpos = (((const char *)q)-4)-starts;
3559 endinpos = startinpos+2;
3560 goto utf16Error;
3561 }
3562
Benjamin Peterson14339b62009-01-31 16:36:08 +00003563 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003564 errmsg = "illegal encoding";
3565 startinpos = (((const char *)q)-2)-starts;
3566 endinpos = startinpos+2;
3567 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003568
Benjamin Peterson29060642009-01-31 22:14:21 +00003569 utf16Error:
3570 outpos = p - PyUnicode_AS_UNICODE(unicode);
3571 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003572 errors,
3573 &errorHandler,
3574 "utf16", errmsg,
3575 &starts,
3576 (const char **)&e,
3577 &startinpos,
3578 &endinpos,
3579 &exc,
3580 (const char **)&q,
3581 &unicode,
3582 &outpos,
3583 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003584 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003586 /* remaining byte at the end? (size should be even) */
3587 if (e == q) {
3588 if (!consumed) {
3589 errmsg = "truncated data";
3590 startinpos = ((const char *)q) - starts;
3591 endinpos = ((const char *)e) + 1 - starts;
3592 outpos = p - PyUnicode_AS_UNICODE(unicode);
3593 if (unicode_decode_call_errorhandler(
3594 errors,
3595 &errorHandler,
3596 "utf16", errmsg,
3597 &starts,
3598 (const char **)&e,
3599 &startinpos,
3600 &endinpos,
3601 &exc,
3602 (const char **)&q,
3603 &unicode,
3604 &outpos,
3605 &p))
3606 goto onError;
3607 /* The remaining input chars are ignored if the callback
3608 chooses to skip the input */
3609 }
3610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611
3612 if (byteorder)
3613 *byteorder = bo;
3614
Walter Dörwald69652032004-09-07 20:24:22 +00003615 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003617
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003619 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 goto onError;
3621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 Py_XDECREF(errorHandler);
3623 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003624 return (PyObject *)unicode;
3625
Benjamin Peterson29060642009-01-31 22:14:21 +00003626 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 Py_XDECREF(errorHandler);
3629 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 return NULL;
3631}
3632
Antoine Pitrouab868312009-01-10 15:40:25 +00003633#undef FAST_CHAR_MASK
3634#undef SWAPPED_FAST_CHAR_MASK
3635
Tim Peters772747b2001-08-09 22:21:55 +00003636PyObject *
3637PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 Py_ssize_t size,
3639 const char *errors,
3640 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003642 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003643 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003644 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003645#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003646 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003647#else
3648 const int pairs = 0;
3649#endif
Tim Peters772747b2001-08-09 22:21:55 +00003650 /* Offsets from p for storing byte pairs in the right order. */
3651#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3652 int ihi = 1, ilo = 0;
3653#else
3654 int ihi = 0, ilo = 1;
3655#endif
3656
Benjamin Peterson29060642009-01-31 22:14:21 +00003657#define STORECHAR(CH) \
3658 do { \
3659 p[ihi] = ((CH) >> 8) & 0xff; \
3660 p[ilo] = (CH) & 0xff; \
3661 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003662 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003664#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003665 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 if (s[i] >= 0x10000)
3667 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003668#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003669 /* 2 * (size + pairs + (byteorder == 0)) */
3670 if (size > PY_SSIZE_T_MAX ||
3671 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003673 nsize = size + pairs + (byteorder == 0);
3674 bytesize = nsize * 2;
3675 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003677 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003678 if (v == NULL)
3679 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003681 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003683 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003684 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003685 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003686
3687 if (byteorder == -1) {
3688 /* force LE */
3689 ihi = 1;
3690 ilo = 0;
3691 }
3692 else if (byteorder == 1) {
3693 /* force BE */
3694 ihi = 0;
3695 ilo = 1;
3696 }
3697
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003698 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003699 Py_UNICODE ch = *s++;
3700 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003701#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 if (ch >= 0x10000) {
3703 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3704 ch = 0xD800 | ((ch-0x10000) >> 10);
3705 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003706#endif
Tim Peters772747b2001-08-09 22:21:55 +00003707 STORECHAR(ch);
3708 if (ch2)
3709 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003710 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003711
3712 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003713 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003714#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715}
3716
3717PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3718{
3719 if (!PyUnicode_Check(unicode)) {
3720 PyErr_BadArgument();
3721 return NULL;
3722 }
3723 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003724 PyUnicode_GET_SIZE(unicode),
3725 NULL,
3726 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727}
3728
3729/* --- Unicode Escape Codec ----------------------------------------------- */
3730
Fredrik Lundh06d12682001-01-24 07:59:11 +00003731static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003732
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003734 Py_ssize_t size,
3735 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003738 Py_ssize_t startinpos;
3739 Py_ssize_t endinpos;
3740 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003741 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003743 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003745 char* message;
3746 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003747 PyObject *errorHandler = NULL;
3748 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003749
Guido van Rossumd57fd912000-03-10 22:53:23 +00003750 /* Escaped strings will always be longer than the resulting
3751 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003752 length after conversion to the true value.
3753 (but if the error callback returns a long replacement string
3754 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003755 v = _PyUnicode_New(size);
3756 if (v == NULL)
3757 goto onError;
3758 if (size == 0)
3759 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003763
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 while (s < end) {
3765 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003766 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768
3769 /* Non-escape characters are interpreted as Unicode ordinals */
3770 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003771 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772 continue;
3773 }
3774
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776 /* \ - Escapes */
3777 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003778 c = *s++;
3779 if (s > end)
3780 c = '\0'; /* Invalid after \ */
3781 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782
Benjamin Peterson29060642009-01-31 22:14:21 +00003783 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 case '\n': break;
3785 case '\\': *p++ = '\\'; break;
3786 case '\'': *p++ = '\''; break;
3787 case '\"': *p++ = '\"'; break;
3788 case 'b': *p++ = '\b'; break;
3789 case 'f': *p++ = '\014'; break; /* FF */
3790 case 't': *p++ = '\t'; break;
3791 case 'n': *p++ = '\n'; break;
3792 case 'r': *p++ = '\r'; break;
3793 case 'v': *p++ = '\013'; break; /* VT */
3794 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3795
Benjamin Peterson29060642009-01-31 22:14:21 +00003796 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 case '0': case '1': case '2': case '3':
3798 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003799 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003800 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003801 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003802 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003803 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003805 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 break;
3807
Benjamin Peterson29060642009-01-31 22:14:21 +00003808 /* hex escapes */
3809 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003811 digits = 2;
3812 message = "truncated \\xXX escape";
3813 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814
Benjamin Peterson29060642009-01-31 22:14:21 +00003815 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003817 digits = 4;
3818 message = "truncated \\uXXXX escape";
3819 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820
Benjamin Peterson29060642009-01-31 22:14:21 +00003821 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003822 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003823 digits = 8;
3824 message = "truncated \\UXXXXXXXX escape";
3825 hexescape:
3826 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003827 outpos = p-PyUnicode_AS_UNICODE(v);
3828 if (s+digits>end) {
3829 endinpos = size;
3830 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003831 errors, &errorHandler,
3832 "unicodeescape", "end of string in escape sequence",
3833 &starts, &end, &startinpos, &endinpos, &exc, &s,
3834 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003835 goto onError;
3836 goto nextByte;
3837 }
3838 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003839 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003840 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003841 endinpos = (s+i+1)-starts;
3842 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 errors, &errorHandler,
3844 "unicodeescape", message,
3845 &starts, &end, &startinpos, &endinpos, &exc, &s,
3846 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003847 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003848 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003849 }
3850 chr = (chr<<4) & ~0xF;
3851 if (c >= '0' && c <= '9')
3852 chr += c - '0';
3853 else if (c >= 'a' && c <= 'f')
3854 chr += 10 + c - 'a';
3855 else
3856 chr += 10 + c - 'A';
3857 }
3858 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003859 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003860 /* _decoding_error will have already written into the
3861 target buffer. */
3862 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003863 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003864 /* when we get here, chr is a 32-bit unicode character */
3865 if (chr <= 0xffff)
3866 /* UCS-2 character */
3867 *p++ = (Py_UNICODE) chr;
3868 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003869 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003870 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003871#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003872 *p++ = chr;
3873#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003874 chr -= 0x10000L;
3875 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003876 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003877#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003878 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 endinpos = s-starts;
3880 outpos = p-PyUnicode_AS_UNICODE(v);
3881 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003882 errors, &errorHandler,
3883 "unicodeescape", "illegal Unicode character",
3884 &starts, &end, &startinpos, &endinpos, &exc, &s,
3885 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003886 goto onError;
3887 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003888 break;
3889
Benjamin Peterson29060642009-01-31 22:14:21 +00003890 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003891 case 'N':
3892 message = "malformed \\N character escape";
3893 if (ucnhash_CAPI == NULL) {
3894 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003895 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003896 if (ucnhash_CAPI == NULL)
3897 goto ucnhashError;
3898 }
3899 if (*s == '{') {
3900 const char *start = s+1;
3901 /* look for the closing brace */
3902 while (*s != '}' && s < end)
3903 s++;
3904 if (s > start && s < end && *s == '}') {
3905 /* found a name. look it up in the unicode database */
3906 message = "unknown Unicode character name";
3907 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003908 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003909 goto store;
3910 }
3911 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003912 endinpos = s-starts;
3913 outpos = p-PyUnicode_AS_UNICODE(v);
3914 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003915 errors, &errorHandler,
3916 "unicodeescape", message,
3917 &starts, &end, &startinpos, &endinpos, &exc, &s,
3918 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003919 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003920 break;
3921
3922 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003923 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003924 message = "\\ at end of string";
3925 s--;
3926 endinpos = s-starts;
3927 outpos = p-PyUnicode_AS_UNICODE(v);
3928 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003929 errors, &errorHandler,
3930 "unicodeescape", message,
3931 &starts, &end, &startinpos, &endinpos, &exc, &s,
3932 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003933 goto onError;
3934 }
3935 else {
3936 *p++ = '\\';
3937 *p++ = (unsigned char)s[-1];
3938 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003939 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003940 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003941 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003942 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003944 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003946 Py_XDECREF(errorHandler);
3947 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003949
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003951 PyErr_SetString(
3952 PyExc_UnicodeError,
3953 "\\N escapes not supported (can't load unicodedata module)"
3954 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003955 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003956 Py_XDECREF(errorHandler);
3957 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003958 return NULL;
3959
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003962 Py_XDECREF(errorHandler);
3963 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 return NULL;
3965}
3966
/* Return a Unicode-Escape string version of the Unicode object.

   The result is a bytes object holding only the escaped text; the
   encoder takes no "quotes" argument and adds no surrounding quotes.

*/
3973
Thomas Wouters477c8d52006-05-27 19:21:47 +00003974Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 Py_ssize_t size,
3976 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00003977{
3978 /* like wcschr, but doesn't stop at NULL characters */
3979
3980 while (size-- > 0) {
3981 if (*s == ch)
3982 return s;
3983 s++;
3984 }
3985
3986 return NULL;
3987}
Barry Warsaw51ac5802000-03-20 16:36:48 +00003988
Walter Dörwald79e913e2007-05-12 11:08:06 +00003989static const char *hexdigits = "0123456789abcdef";
3990
3991PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003994 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003997#ifdef Py_UNICODE_WIDE
3998 const Py_ssize_t expandsize = 10;
3999#else
4000 const Py_ssize_t expandsize = 6;
4001#endif
4002
Thomas Wouters89f507f2006-12-13 04:49:30 +00004003 /* XXX(nnorwitz): rather than over-allocating, it would be
4004 better to choose a different scheme. Perhaps scan the
4005 first N-chars of the string and allocate based on that size.
4006 */
4007 /* Initial allocation is based on the longest-possible unichr
4008 escape.
4009
4010 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4011 unichr, so in this case it's the longest unichr escape. In
4012 narrow (UTF-16) builds this is five chars per source unichr
4013 since there are two unichrs in the surrogate pair, so in narrow
4014 (UTF-16) builds it's not the longest unichr escape.
4015
4016 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4017 so in the narrow (UTF-16) build case it's the longest unichr
4018 escape.
4019 */
4020
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004021 if (size == 0)
4022 return PyBytes_FromStringAndSize(NULL, 0);
4023
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004024 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004025 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004026
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004027 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004028 2
4029 + expandsize*size
4030 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 if (repr == NULL)
4032 return NULL;
4033
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004034 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 while (size-- > 0) {
4037 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004038
Walter Dörwald79e913e2007-05-12 11:08:06 +00004039 /* Escape backslashes */
4040 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041 *p++ = '\\';
4042 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004043 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004044 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004045
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004046#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004047 /* Map 21-bit characters to '\U00xxxxxx' */
4048 else if (ch >= 0x10000) {
4049 *p++ = '\\';
4050 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004051 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4052 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4053 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4054 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4055 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4056 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4057 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4058 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004059 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004060 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004061#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004062 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4063 else if (ch >= 0xD800 && ch < 0xDC00) {
4064 Py_UNICODE ch2;
4065 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004066
Benjamin Peterson29060642009-01-31 22:14:21 +00004067 ch2 = *s++;
4068 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004069 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004070 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4071 *p++ = '\\';
4072 *p++ = 'U';
4073 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4074 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4075 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4076 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4077 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4078 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4079 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4080 *p++ = hexdigits[ucs & 0x0000000F];
4081 continue;
4082 }
4083 /* Fall through: isolated surrogates are copied as-is */
4084 s--;
4085 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004086 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004087#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004088
Guido van Rossumd57fd912000-03-10 22:53:23 +00004089 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004090 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 *p++ = '\\';
4092 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004093 *p++ = hexdigits[(ch >> 12) & 0x000F];
4094 *p++ = hexdigits[(ch >> 8) & 0x000F];
4095 *p++ = hexdigits[(ch >> 4) & 0x000F];
4096 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004098
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004099 /* Map special whitespace to '\t', \n', '\r' */
4100 else if (ch == '\t') {
4101 *p++ = '\\';
4102 *p++ = 't';
4103 }
4104 else if (ch == '\n') {
4105 *p++ = '\\';
4106 *p++ = 'n';
4107 }
4108 else if (ch == '\r') {
4109 *p++ = '\\';
4110 *p++ = 'r';
4111 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004112
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004113 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004114 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004116 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004117 *p++ = hexdigits[(ch >> 4) & 0x000F];
4118 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004119 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004120
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 /* Copy everything else as-is */
4122 else
4123 *p++ = (char) ch;
4124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004126 assert(p - PyBytes_AS_STRING(repr) > 0);
4127 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4128 return NULL;
4129 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130}
4131
Alexandre Vassalotti2056bed2008-12-27 19:46:35 +00004132PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004134 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 if (!PyUnicode_Check(unicode)) {
4136 PyErr_BadArgument();
4137 return NULL;
4138 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004139 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4140 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004141 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004142}
4143
4144/* --- Raw Unicode Escape Codec ------------------------------------------- */
4145
4146PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 Py_ssize_t size,
4148 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004151 Py_ssize_t startinpos;
4152 Py_ssize_t endinpos;
4153 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 const char *end;
4157 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 PyObject *errorHandler = NULL;
4159 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004160
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161 /* Escaped strings will always be longer than the resulting
4162 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 length after conversion to the true value. (But decoding error
4164 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 v = _PyUnicode_New(size);
4166 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004167 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004170 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 end = s + size;
4172 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 unsigned char c;
4174 Py_UCS4 x;
4175 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004176 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 /* Non-escape characters are interpreted as Unicode ordinals */
4179 if (*s != '\\') {
4180 *p++ = (unsigned char)*s++;
4181 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004182 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004183 startinpos = s-starts;
4184
4185 /* \u-escapes are only interpreted iff the number of leading
4186 backslashes if odd */
4187 bs = s;
4188 for (;s < end;) {
4189 if (*s != '\\')
4190 break;
4191 *p++ = (unsigned char)*s++;
4192 }
4193 if (((s - bs) & 1) == 0 ||
4194 s >= end ||
4195 (*s != 'u' && *s != 'U')) {
4196 continue;
4197 }
4198 p--;
4199 count = *s=='u' ? 4 : 8;
4200 s++;
4201
4202 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4203 outpos = p-PyUnicode_AS_UNICODE(v);
4204 for (x = 0, i = 0; i < count; ++i, ++s) {
4205 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004206 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 endinpos = s-starts;
4208 if (unicode_decode_call_errorhandler(
4209 errors, &errorHandler,
4210 "rawunicodeescape", "truncated \\uXXXX",
4211 &starts, &end, &startinpos, &endinpos, &exc, &s,
4212 &v, &outpos, &p))
4213 goto onError;
4214 goto nextByte;
4215 }
4216 x = (x<<4) & ~0xF;
4217 if (c >= '0' && c <= '9')
4218 x += c - '0';
4219 else if (c >= 'a' && c <= 'f')
4220 x += 10 + c - 'a';
4221 else
4222 x += 10 + c - 'A';
4223 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004224 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 /* UCS-2 character */
4226 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004227 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004228 /* UCS-4 character. Either store directly, or as
4229 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004230#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004232#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004233 x -= 0x10000L;
4234 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4235 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004236#endif
4237 } else {
4238 endinpos = s-starts;
4239 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004240 if (unicode_decode_call_errorhandler(
4241 errors, &errorHandler,
4242 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004243 &starts, &end, &startinpos, &endinpos, &exc, &s,
4244 &v, &outpos, &p))
4245 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004246 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 nextByte:
4248 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004250 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004251 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 Py_XDECREF(errorHandler);
4253 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004255
Benjamin Peterson29060642009-01-31 22:14:21 +00004256 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004258 Py_XDECREF(errorHandler);
4259 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 return NULL;
4261}
4262
4263PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004264 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004266 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267 char *p;
4268 char *q;
4269
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004270#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004271 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004272#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004273 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004274#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004275
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004276 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004278
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004279 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 if (repr == NULL)
4281 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004282 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004283 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004285 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 while (size-- > 0) {
4287 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004288#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 /* Map 32-bit characters to '\Uxxxxxxxx' */
4290 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004291 *p++ = '\\';
4292 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004293 *p++ = hexdigits[(ch >> 28) & 0xf];
4294 *p++ = hexdigits[(ch >> 24) & 0xf];
4295 *p++ = hexdigits[(ch >> 20) & 0xf];
4296 *p++ = hexdigits[(ch >> 16) & 0xf];
4297 *p++ = hexdigits[(ch >> 12) & 0xf];
4298 *p++ = hexdigits[(ch >> 8) & 0xf];
4299 *p++ = hexdigits[(ch >> 4) & 0xf];
4300 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004301 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004302 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004303#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004304 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4305 if (ch >= 0xD800 && ch < 0xDC00) {
4306 Py_UNICODE ch2;
4307 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004308
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 ch2 = *s++;
4310 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004311 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4313 *p++ = '\\';
4314 *p++ = 'U';
4315 *p++ = hexdigits[(ucs >> 28) & 0xf];
4316 *p++ = hexdigits[(ucs >> 24) & 0xf];
4317 *p++ = hexdigits[(ucs >> 20) & 0xf];
4318 *p++ = hexdigits[(ucs >> 16) & 0xf];
4319 *p++ = hexdigits[(ucs >> 12) & 0xf];
4320 *p++ = hexdigits[(ucs >> 8) & 0xf];
4321 *p++ = hexdigits[(ucs >> 4) & 0xf];
4322 *p++ = hexdigits[ucs & 0xf];
4323 continue;
4324 }
4325 /* Fall through: isolated surrogates are copied as-is */
4326 s--;
4327 size++;
4328 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004329#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004330 /* Map 16-bit characters to '\uxxxx' */
4331 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 *p++ = '\\';
4333 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004334 *p++ = hexdigits[(ch >> 12) & 0xf];
4335 *p++ = hexdigits[(ch >> 8) & 0xf];
4336 *p++ = hexdigits[(ch >> 4) & 0xf];
4337 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 /* Copy everything else as-is */
4340 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 *p++ = (char) ch;
4342 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004343 size = p - q;
4344
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004345 assert(size > 0);
4346 if (_PyBytes_Resize(&repr, size) < 0)
4347 return NULL;
4348 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349}
4350
4351PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4352{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004353 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004355 PyErr_BadArgument();
4356 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004358 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4359 PyUnicode_GET_SIZE(unicode));
4360
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004361 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362}
4363
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004364/* --- Unicode Internal Codec ------------------------------------------- */
4365
4366PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 Py_ssize_t size,
4368 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004369{
4370 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004371 Py_ssize_t startinpos;
4372 Py_ssize_t endinpos;
4373 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004374 PyUnicodeObject *v;
4375 Py_UNICODE *p;
4376 const char *end;
4377 const char *reason;
4378 PyObject *errorHandler = NULL;
4379 PyObject *exc = NULL;
4380
Neal Norwitzd43069c2006-01-08 01:12:10 +00004381#ifdef Py_UNICODE_WIDE
4382 Py_UNICODE unimax = PyUnicode_GetMax();
4383#endif
4384
Thomas Wouters89f507f2006-12-13 04:49:30 +00004385 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004386 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4387 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004389 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004391 p = PyUnicode_AS_UNICODE(v);
4392 end = s + size;
4393
4394 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004395 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004396 /* We have to sanity check the raw data, otherwise doom looms for
4397 some malformed UCS-4 data. */
4398 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004399#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004400 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004401#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004402 end-s < Py_UNICODE_SIZE
4403 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004405 startinpos = s - starts;
4406 if (end-s < Py_UNICODE_SIZE) {
4407 endinpos = end-starts;
4408 reason = "truncated input";
4409 }
4410 else {
4411 endinpos = s - starts + Py_UNICODE_SIZE;
4412 reason = "illegal code point (> 0x10FFFF)";
4413 }
4414 outpos = p - PyUnicode_AS_UNICODE(v);
4415 if (unicode_decode_call_errorhandler(
4416 errors, &errorHandler,
4417 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004418 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004419 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004420 goto onError;
4421 }
4422 }
4423 else {
4424 p++;
4425 s += Py_UNICODE_SIZE;
4426 }
4427 }
4428
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004429 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004430 goto onError;
4431 Py_XDECREF(errorHandler);
4432 Py_XDECREF(exc);
4433 return (PyObject *)v;
4434
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004436 Py_XDECREF(v);
4437 Py_XDECREF(errorHandler);
4438 Py_XDECREF(exc);
4439 return NULL;
4440}
4441
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442/* --- Latin-1 Codec ------------------------------------------------------ */
4443
4444PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 Py_ssize_t size,
4446 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447{
4448 PyUnicodeObject *v;
4449 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004450 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004451
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004453 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 Py_UNICODE r = *(unsigned char*)s;
4455 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004456 }
4457
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 v = _PyUnicode_New(size);
4459 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004461 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004464 e = s + size;
4465 /* Unrolling the copy makes it much faster by reducing the looping
4466 overhead. This is similar to what many memcpy() implementations do. */
4467 unrolled_end = e - 4;
4468 while (s < unrolled_end) {
4469 p[0] = (unsigned char) s[0];
4470 p[1] = (unsigned char) s[1];
4471 p[2] = (unsigned char) s[2];
4472 p[3] = (unsigned char) s[3];
4473 s += 4;
4474 p += 4;
4475 }
4476 while (s < e)
4477 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004478 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004479
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 Py_XDECREF(v);
4482 return NULL;
4483}
4484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004485/* create or adjust a UnicodeEncodeError */
4486static void make_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004487 const char *encoding,
4488 const Py_UNICODE *unicode, Py_ssize_t size,
4489 Py_ssize_t startpos, Py_ssize_t endpos,
4490 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004492 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 *exceptionObject = PyUnicodeEncodeError_Create(
4494 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004495 }
4496 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4498 goto onError;
4499 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4500 goto onError;
4501 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4502 goto onError;
4503 return;
4504 onError:
4505 Py_DECREF(*exceptionObject);
4506 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004507 }
4508}
4509
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004510/* raises a UnicodeEncodeError */
4511static void raise_encode_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004512 const char *encoding,
4513 const Py_UNICODE *unicode, Py_ssize_t size,
4514 Py_ssize_t startpos, Py_ssize_t endpos,
4515 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004516{
4517 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004519 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004521}
4522
4523/* error handling callback helper:
4524 build arguments, call the callback and check the arguments,
4525 put the result into newpos and return the replacement string, which
4526 has to be freed by the caller */
4527static PyObject *unicode_encode_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 PyObject **errorHandler,
4529 const char *encoding, const char *reason,
4530 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4531 Py_ssize_t startpos, Py_ssize_t endpos,
4532 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004534 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535
4536 PyObject *restuple;
4537 PyObject *resunicode;
4538
4539 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004541 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004543 }
4544
4545 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549
4550 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004552 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004555 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 Py_DECREF(restuple);
4557 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004559 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 &resunicode, newpos)) {
4561 Py_DECREF(restuple);
4562 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004564 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4565 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4566 Py_DECREF(restuple);
4567 return NULL;
4568 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004569 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004570 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004571 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004572 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4573 Py_DECREF(restuple);
4574 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004575 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576 Py_INCREF(resunicode);
4577 Py_DECREF(restuple);
4578 return resunicode;
4579}
4580
4581static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 Py_ssize_t size,
4583 const char *errors,
4584 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585{
4586 /* output object */
4587 PyObject *res;
4588 /* pointers to the beginning and end+1 of input */
4589 const Py_UNICODE *startp = p;
4590 const Py_UNICODE *endp = p + size;
4591 /* pointer to the beginning of the unencodable characters */
4592 /* const Py_UNICODE *badp = NULL; */
4593 /* pointer into the output */
4594 char *str;
4595 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004596 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004597 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4598 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 PyObject *errorHandler = NULL;
4600 PyObject *exc = NULL;
4601 /* the following variable is used for caching string comparisons
4602 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4603 int known_errorHandler = -1;
4604
4605 /* allocate enough for a simple encoding without
4606 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004607 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004608 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004609 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004611 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004612 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 ressize = size;
4614
4615 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 /* can we encode this? */
4619 if (c<limit) {
4620 /* no overflow check, because we know that the space is enough */
4621 *str++ = (char)c;
4622 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004623 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004624 else {
4625 Py_ssize_t unicodepos = p-startp;
4626 Py_ssize_t requiredsize;
4627 PyObject *repunicode;
4628 Py_ssize_t repsize;
4629 Py_ssize_t newpos;
4630 Py_ssize_t respos;
4631 Py_UNICODE *uni2;
4632 /* startpos for collecting unencodable chars */
4633 const Py_UNICODE *collstart = p;
4634 const Py_UNICODE *collend = p;
4635 /* find all unecodable characters */
4636 while ((collend < endp) && ((*collend)>=limit))
4637 ++collend;
4638 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4639 if (known_errorHandler==-1) {
4640 if ((errors==NULL) || (!strcmp(errors, "strict")))
4641 known_errorHandler = 1;
4642 else if (!strcmp(errors, "replace"))
4643 known_errorHandler = 2;
4644 else if (!strcmp(errors, "ignore"))
4645 known_errorHandler = 3;
4646 else if (!strcmp(errors, "xmlcharrefreplace"))
4647 known_errorHandler = 4;
4648 else
4649 known_errorHandler = 0;
4650 }
4651 switch (known_errorHandler) {
4652 case 1: /* strict */
4653 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4654 goto onError;
4655 case 2: /* replace */
4656 while (collstart++<collend)
4657 *str++ = '?'; /* fall through */
4658 case 3: /* ignore */
4659 p = collend;
4660 break;
4661 case 4: /* xmlcharrefreplace */
4662 respos = str - PyBytes_AS_STRING(res);
4663 /* determine replacement size (temporarily (mis)uses p) */
4664 for (p = collstart, repsize = 0; p < collend; ++p) {
4665 if (*p<10)
4666 repsize += 2+1+1;
4667 else if (*p<100)
4668 repsize += 2+2+1;
4669 else if (*p<1000)
4670 repsize += 2+3+1;
4671 else if (*p<10000)
4672 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004673#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004674 else
4675 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004676#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 else if (*p<100000)
4678 repsize += 2+5+1;
4679 else if (*p<1000000)
4680 repsize += 2+6+1;
4681 else
4682 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004683#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 }
4685 requiredsize = respos+repsize+(endp-collend);
4686 if (requiredsize > ressize) {
4687 if (requiredsize<2*ressize)
4688 requiredsize = 2*ressize;
4689 if (_PyBytes_Resize(&res, requiredsize))
4690 goto onError;
4691 str = PyBytes_AS_STRING(res) + respos;
4692 ressize = requiredsize;
4693 }
4694 /* generate replacement (temporarily (mis)uses p) */
4695 for (p = collstart; p < collend; ++p) {
4696 str += sprintf(str, "&#%d;", (int)*p);
4697 }
4698 p = collend;
4699 break;
4700 default:
4701 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4702 encoding, reason, startp, size, &exc,
4703 collstart-startp, collend-startp, &newpos);
4704 if (repunicode == NULL)
4705 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004706 if (PyBytes_Check(repunicode)) {
4707 /* Directly copy bytes result to output. */
4708 repsize = PyBytes_Size(repunicode);
4709 if (repsize > 1) {
4710 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004711 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004712 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4713 Py_DECREF(repunicode);
4714 goto onError;
4715 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004716 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004717 ressize += repsize-1;
4718 }
4719 memcpy(str, PyBytes_AsString(repunicode), repsize);
4720 str += repsize;
4721 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004722 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004723 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004724 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 /* need more space? (at least enough for what we
4726 have+the replacement+the rest of the string, so
4727 we won't have to check space for encodable characters) */
4728 respos = str - PyBytes_AS_STRING(res);
4729 repsize = PyUnicode_GET_SIZE(repunicode);
4730 requiredsize = respos+repsize+(endp-collend);
4731 if (requiredsize > ressize) {
4732 if (requiredsize<2*ressize)
4733 requiredsize = 2*ressize;
4734 if (_PyBytes_Resize(&res, requiredsize)) {
4735 Py_DECREF(repunicode);
4736 goto onError;
4737 }
4738 str = PyBytes_AS_STRING(res) + respos;
4739 ressize = requiredsize;
4740 }
4741 /* check if there is anything unencodable in the replacement
4742 and copy it to the output */
4743 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4744 c = *uni2;
4745 if (c >= limit) {
4746 raise_encode_exception(&exc, encoding, startp, size,
4747 unicodepos, unicodepos+1, reason);
4748 Py_DECREF(repunicode);
4749 goto onError;
4750 }
4751 *str = (char)c;
4752 }
4753 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004754 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004755 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004756 }
4757 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004758 /* Resize if we allocated to much */
4759 size = str - PyBytes_AS_STRING(res);
4760 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004761 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004762 if (_PyBytes_Resize(&res, size) < 0)
4763 goto onError;
4764 }
4765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004766 Py_XDECREF(errorHandler);
4767 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004768 return res;
4769
4770 onError:
4771 Py_XDECREF(res);
4772 Py_XDECREF(errorHandler);
4773 Py_XDECREF(exc);
4774 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004775}
4776
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004778 Py_ssize_t size,
4779 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004781 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782}
4783
4784PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4785{
4786 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004787 PyErr_BadArgument();
4788 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 }
4790 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 PyUnicode_GET_SIZE(unicode),
4792 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793}
4794
4795/* --- 7-bit ASCII Codec -------------------------------------------------- */
4796
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 Py_ssize_t size,
4799 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004801 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 PyUnicodeObject *v;
4803 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004804 Py_ssize_t startinpos;
4805 Py_ssize_t endinpos;
4806 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004807 const char *e;
4808 PyObject *errorHandler = NULL;
4809 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004810
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004812 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004813 Py_UNICODE r = *(unsigned char*)s;
4814 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004815 }
Tim Petersced69f82003-09-16 20:30:58 +00004816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 v = _PyUnicode_New(size);
4818 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004819 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004821 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004823 e = s + size;
4824 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 register unsigned char c = (unsigned char)*s;
4826 if (c < 128) {
4827 *p++ = c;
4828 ++s;
4829 }
4830 else {
4831 startinpos = s-starts;
4832 endinpos = startinpos + 1;
4833 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4834 if (unicode_decode_call_errorhandler(
4835 errors, &errorHandler,
4836 "ascii", "ordinal not in range(128)",
4837 &starts, &e, &startinpos, &endinpos, &exc, &s,
4838 &v, &outpos, &p))
4839 goto onError;
4840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004842 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004843 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004845 Py_XDECREF(errorHandler);
4846 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004848
Benjamin Peterson29060642009-01-31 22:14:21 +00004849 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004851 Py_XDECREF(errorHandler);
4852 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 return NULL;
4854}
4855
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 Py_ssize_t size,
4858 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004860 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861}
4862
4863PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4864{
4865 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 PyErr_BadArgument();
4867 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 }
4869 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 PyUnicode_GET_SIZE(unicode),
4871 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872}
4873
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004874#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004875
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004876/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004877
/* NEED_RETRY is defined when size_t is wider than int.  The MBCS helpers
   below take int lengths, so the public entry points then process inputs
   longer than INT_MAX in several passes. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif
4881
4882/* XXX This code is limited to "true" double-byte encodings, as
4883 a) it assumes an incomplete character consists of a single byte, and
4884 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004885 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004886
/* Return 1 if the byte at s[offset] passes IsDBCSLeadByte() and one of the
   following holds for the position CharPrev() reports before it: it is
   the same position, the byte there is not itself a lead byte, or it lies
   exactly two bytes back.  Return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
4897
4898/*
4899 * Decode MBCS string into unicode object. If 'final' is set, converts
4900 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4901 */
4902static int decode_mbcs(PyUnicodeObject **v,
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 const char *s, /* MBCS string */
4904 int size, /* sizeof MBCS string */
Victor Stinner554f3f02010-06-16 23:33:54 +00004905 int final,
4906 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004907{
4908 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004909 Py_ssize_t n;
4910 DWORD usize;
4911 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004912
4913 assert(size >= 0);
4914
Victor Stinner554f3f02010-06-16 23:33:54 +00004915 /* check and handle 'errors' arg */
4916 if (errors==NULL || strcmp(errors, "strict")==0)
4917 flags = MB_ERR_INVALID_CHARS;
4918 else if (strcmp(errors, "ignore")==0)
4919 flags = 0;
4920 else {
4921 PyErr_Format(PyExc_ValueError,
4922 "mbcs encoding does not support errors='%s'",
4923 errors);
4924 return -1;
4925 }
4926
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004927 /* Skip trailing lead-byte unless 'final' is set */
4928 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004930
4931 /* First get the size of the result */
4932 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004933 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4934 if (usize==0)
4935 goto mbcs_decode_error;
4936 } else
4937 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004938
4939 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 /* Create unicode object */
4941 *v = _PyUnicode_New(usize);
4942 if (*v == NULL)
4943 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004944 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004945 }
4946 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 /* Extend unicode object */
4948 n = PyUnicode_GET_SIZE(*v);
4949 if (_PyUnicode_Resize(v, n + usize) < 0)
4950 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004951 }
4952
4953 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00004954 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00004956 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4957 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004959 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004960 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00004961
4962mbcs_decode_error:
4963 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4964 we raise a UnicodeDecodeError - else it is a 'generic'
4965 windows error
4966 */
4967 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4968 /* Ideally, we should get reason from FormatMessage - this
4969 is the Windows 2000 English version of the message
4970 */
4971 PyObject *exc = NULL;
4972 const char *reason = "No mapping for the Unicode character exists "
4973 "in the target multi-byte code page.";
4974 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4975 if (exc != NULL) {
4976 PyCodec_StrictErrors(exc);
4977 Py_DECREF(exc);
4978 }
4979 } else {
4980 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4981 }
4982 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004983}
4984
4985PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 Py_ssize_t size,
4987 const char *errors,
4988 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004989{
4990 PyUnicodeObject *v = NULL;
4991 int done;
4992
4993 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004994 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004995
4996#ifdef NEED_RETRY
4997 retry:
4998 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00004999 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000 else
5001#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005002 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005003
5004 if (done < 0) {
5005 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005007 }
5008
5009 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005011
5012#ifdef NEED_RETRY
5013 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 s += done;
5015 size -= done;
5016 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005017 }
5018#endif
5019
5020 return (PyObject *)v;
5021}
5022
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005023PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 Py_ssize_t size,
5025 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005026{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005027 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5028}
5029
5030/*
5031 * Convert unicode into string object (MBCS).
5032 * Returns 0 if succeed, -1 otherwise.
5033 */
5034static int encode_mbcs(PyObject **repr,
Benjamin Peterson29060642009-01-31 22:14:21 +00005035 const Py_UNICODE *p, /* unicode */
Victor Stinner554f3f02010-06-16 23:33:54 +00005036 int size, /* size of unicode */
5037 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005038{
Victor Stinner554f3f02010-06-16 23:33:54 +00005039 BOOL usedDefaultChar = FALSE;
5040 BOOL *pusedDefaultChar;
5041 int mbcssize;
5042 Py_ssize_t n;
5043 PyObject *exc = NULL;
5044 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005045
5046 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005047
Victor Stinner554f3f02010-06-16 23:33:54 +00005048 /* check and handle 'errors' arg */
5049 if (errors==NULL || strcmp(errors, "strict")==0) {
5050 flags = WC_NO_BEST_FIT_CHARS;
5051 pusedDefaultChar = &usedDefaultChar;
5052 } else if (strcmp(errors, "replace")==0) {
5053 flags = 0;
5054 pusedDefaultChar = NULL;
5055 } else {
5056 PyErr_Format(PyExc_ValueError,
5057 "mbcs encoding does not support errors='%s'",
5058 errors);
5059 return -1;
5060 }
5061
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005062 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005063 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005064 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5065 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 if (mbcssize == 0) {
5067 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5068 return -1;
5069 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005070 /* If we used a default char, then we failed! */
5071 if (pusedDefaultChar && *pusedDefaultChar)
5072 goto mbcs_encode_error;
5073 } else {
5074 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005075 }
5076
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005077 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 /* Create string object */
5079 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5080 if (*repr == NULL)
5081 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005082 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005083 }
5084 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 /* Extend string object */
5086 n = PyBytes_Size(*repr);
5087 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5088 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005089 }
5090
5091 /* Do the conversion */
5092 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005094 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5095 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5097 return -1;
5098 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005099 if (pusedDefaultChar && *pusedDefaultChar)
5100 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005101 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005102 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005103
5104mbcs_encode_error:
5105 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5106 Py_XDECREF(exc);
5107 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005108}
5109
5110PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 Py_ssize_t size,
5112 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005113{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005114 PyObject *repr = NULL;
5115 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005116
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005117#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005119 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005120 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005121 else
5122#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005123 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005124
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005125 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005126 Py_XDECREF(repr);
5127 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005128 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005129
5130#ifdef NEED_RETRY
5131 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 p += INT_MAX;
5133 size -= INT_MAX;
5134 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005135 }
5136#endif
5137
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005138 return repr;
5139}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005140
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005141PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5142{
5143 if (!PyUnicode_Check(unicode)) {
5144 PyErr_BadArgument();
5145 return NULL;
5146 }
5147 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 PyUnicode_GET_SIZE(unicode),
5149 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005150}
5151
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005152#undef NEED_RETRY
5153
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005154#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005155
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156/* --- Character Mapping Codec -------------------------------------------- */
5157
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 Py_ssize_t size,
5160 PyObject *mapping,
5161 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005163 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005164 Py_ssize_t startinpos;
5165 Py_ssize_t endinpos;
5166 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005167 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 PyUnicodeObject *v;
5169 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005170 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005171 PyObject *errorHandler = NULL;
5172 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005173 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005174 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005175
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 /* Default to Latin-1 */
5177 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179
5180 v = _PyUnicode_New(size);
5181 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005182 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005186 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005187 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 mapstring = PyUnicode_AS_UNICODE(mapping);
5189 maplen = PyUnicode_GET_SIZE(mapping);
5190 while (s < e) {
5191 unsigned char ch = *s;
5192 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 if (ch < maplen)
5195 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 if (x == 0xfffe) {
5198 /* undefined mapping */
5199 outpos = p-PyUnicode_AS_UNICODE(v);
5200 startinpos = s-starts;
5201 endinpos = startinpos+1;
5202 if (unicode_decode_call_errorhandler(
5203 errors, &errorHandler,
5204 "charmap", "character maps to <undefined>",
5205 &starts, &e, &startinpos, &endinpos, &exc, &s,
5206 &v, &outpos, &p)) {
5207 goto onError;
5208 }
5209 continue;
5210 }
5211 *p++ = x;
5212 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005213 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005214 }
5215 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 while (s < e) {
5217 unsigned char ch = *s;
5218 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005219
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5221 w = PyLong_FromLong((long)ch);
5222 if (w == NULL)
5223 goto onError;
5224 x = PyObject_GetItem(mapping, w);
5225 Py_DECREF(w);
5226 if (x == NULL) {
5227 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5228 /* No mapping found means: mapping is undefined. */
5229 PyErr_Clear();
5230 x = Py_None;
5231 Py_INCREF(x);
5232 } else
5233 goto onError;
5234 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005235
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 /* Apply mapping */
5237 if (PyLong_Check(x)) {
5238 long value = PyLong_AS_LONG(x);
5239 if (value < 0 || value > 65535) {
5240 PyErr_SetString(PyExc_TypeError,
5241 "character mapping must be in range(65536)");
5242 Py_DECREF(x);
5243 goto onError;
5244 }
5245 *p++ = (Py_UNICODE)value;
5246 }
5247 else if (x == Py_None) {
5248 /* undefined mapping */
5249 outpos = p-PyUnicode_AS_UNICODE(v);
5250 startinpos = s-starts;
5251 endinpos = startinpos+1;
5252 if (unicode_decode_call_errorhandler(
5253 errors, &errorHandler,
5254 "charmap", "character maps to <undefined>",
5255 &starts, &e, &startinpos, &endinpos, &exc, &s,
5256 &v, &outpos, &p)) {
5257 Py_DECREF(x);
5258 goto onError;
5259 }
5260 Py_DECREF(x);
5261 continue;
5262 }
5263 else if (PyUnicode_Check(x)) {
5264 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005265
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 if (targetsize == 1)
5267 /* 1-1 mapping */
5268 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005269
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 else if (targetsize > 1) {
5271 /* 1-n mapping */
5272 if (targetsize > extrachars) {
5273 /* resize first */
5274 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5275 Py_ssize_t needed = (targetsize - extrachars) + \
5276 (targetsize << 2);
5277 extrachars += needed;
5278 /* XXX overflow detection missing */
5279 if (_PyUnicode_Resize(&v,
5280 PyUnicode_GET_SIZE(v) + needed) < 0) {
5281 Py_DECREF(x);
5282 goto onError;
5283 }
5284 p = PyUnicode_AS_UNICODE(v) + oldpos;
5285 }
5286 Py_UNICODE_COPY(p,
5287 PyUnicode_AS_UNICODE(x),
5288 targetsize);
5289 p += targetsize;
5290 extrachars -= targetsize;
5291 }
5292 /* 1-0 mapping: skip the character */
5293 }
5294 else {
5295 /* wrong return value */
5296 PyErr_SetString(PyExc_TypeError,
5297 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005298 Py_DECREF(x);
5299 goto onError;
5300 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005301 Py_DECREF(x);
5302 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005304 }
5305 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005306 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5307 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005308 Py_XDECREF(errorHandler);
5309 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005311
Benjamin Peterson29060642009-01-31 22:14:21 +00005312 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005313 Py_XDECREF(errorHandler);
5314 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315 Py_XDECREF(v);
5316 return NULL;
5317}
5318
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005319/* Charmap encoding: the lookup table */
5320
5321struct encoding_map{
Benjamin Peterson29060642009-01-31 22:14:21 +00005322 PyObject_HEAD
5323 unsigned char level1[32];
5324 int count2, count3;
5325 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005326};
5327
5328static PyObject*
5329encoding_map_size(PyObject *obj, PyObject* args)
5330{
5331 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005332 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005334}
5335
5336static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005337 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 PyDoc_STR("Return the size (in bytes) of this object") },
5339 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005340};
5341
5342static void
5343encoding_map_dealloc(PyObject* o)
5344{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005345 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005346}
5347
5348static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005349 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 "EncodingMap", /*tp_name*/
5351 sizeof(struct encoding_map), /*tp_basicsize*/
5352 0, /*tp_itemsize*/
5353 /* methods */
5354 encoding_map_dealloc, /*tp_dealloc*/
5355 0, /*tp_print*/
5356 0, /*tp_getattr*/
5357 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005358 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 0, /*tp_repr*/
5360 0, /*tp_as_number*/
5361 0, /*tp_as_sequence*/
5362 0, /*tp_as_mapping*/
5363 0, /*tp_hash*/
5364 0, /*tp_call*/
5365 0, /*tp_str*/
5366 0, /*tp_getattro*/
5367 0, /*tp_setattro*/
5368 0, /*tp_as_buffer*/
5369 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5370 0, /*tp_doc*/
5371 0, /*tp_traverse*/
5372 0, /*tp_clear*/
5373 0, /*tp_richcompare*/
5374 0, /*tp_weaklistoffset*/
5375 0, /*tp_iter*/
5376 0, /*tp_iternext*/
5377 encoding_map_methods, /*tp_methods*/
5378 0, /*tp_members*/
5379 0, /*tp_getset*/
5380 0, /*tp_base*/
5381 0, /*tp_dict*/
5382 0, /*tp_descr_get*/
5383 0, /*tp_descr_set*/
5384 0, /*tp_dictoffset*/
5385 0, /*tp_init*/
5386 0, /*tp_alloc*/
5387 0, /*tp_new*/
5388 0, /*tp_free*/
5389 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005390};
5391
5392PyObject*
5393PyUnicode_BuildEncodingMap(PyObject* string)
5394{
5395 Py_UNICODE *decode;
5396 PyObject *result;
5397 struct encoding_map *mresult;
5398 int i;
5399 int need_dict = 0;
5400 unsigned char level1[32];
5401 unsigned char level2[512];
5402 unsigned char *mlevel1, *mlevel2, *mlevel3;
5403 int count2 = 0, count3 = 0;
5404
5405 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5406 PyErr_BadArgument();
5407 return NULL;
5408 }
5409 decode = PyUnicode_AS_UNICODE(string);
5410 memset(level1, 0xFF, sizeof level1);
5411 memset(level2, 0xFF, sizeof level2);
5412
5413 /* If there isn't a one-to-one mapping of NULL to \0,
5414 or if there are non-BMP characters, we need to use
5415 a mapping dictionary. */
5416 if (decode[0] != 0)
5417 need_dict = 1;
5418 for (i = 1; i < 256; i++) {
5419 int l1, l2;
5420 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005421#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005422 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005423#endif
5424 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005425 need_dict = 1;
5426 break;
5427 }
5428 if (decode[i] == 0xFFFE)
5429 /* unmapped character */
5430 continue;
5431 l1 = decode[i] >> 11;
5432 l2 = decode[i] >> 7;
5433 if (level1[l1] == 0xFF)
5434 level1[l1] = count2++;
5435 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005436 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005437 }
5438
5439 if (count2 >= 0xFF || count3 >= 0xFF)
5440 need_dict = 1;
5441
5442 if (need_dict) {
5443 PyObject *result = PyDict_New();
5444 PyObject *key, *value;
5445 if (!result)
5446 return NULL;
5447 for (i = 0; i < 256; i++) {
5448 key = value = NULL;
Christian Heimes217cfd12007-12-02 14:31:20 +00005449 key = PyLong_FromLong(decode[i]);
5450 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005451 if (!key || !value)
5452 goto failed1;
5453 if (PyDict_SetItem(result, key, value) == -1)
5454 goto failed1;
5455 Py_DECREF(key);
5456 Py_DECREF(value);
5457 }
5458 return result;
5459 failed1:
5460 Py_XDECREF(key);
5461 Py_XDECREF(value);
5462 Py_DECREF(result);
5463 return NULL;
5464 }
5465
5466 /* Create a three-level trie */
5467 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5468 16*count2 + 128*count3 - 1);
5469 if (!result)
5470 return PyErr_NoMemory();
5471 PyObject_Init(result, &EncodingMapType);
5472 mresult = (struct encoding_map*)result;
5473 mresult->count2 = count2;
5474 mresult->count3 = count3;
5475 mlevel1 = mresult->level1;
5476 mlevel2 = mresult->level23;
5477 mlevel3 = mresult->level23 + 16*count2;
5478 memcpy(mlevel1, level1, 32);
5479 memset(mlevel2, 0xFF, 16*count2);
5480 memset(mlevel3, 0, 128*count3);
5481 count3 = 0;
5482 for (i = 1; i < 256; i++) {
5483 int o1, o2, o3, i2, i3;
5484 if (decode[i] == 0xFFFE)
5485 /* unmapped character */
5486 continue;
5487 o1 = decode[i]>>11;
5488 o2 = (decode[i]>>7) & 0xF;
5489 i2 = 16*mlevel1[o1] + o2;
5490 if (mlevel2[i2] == 0xFF)
5491 mlevel2[i2] = count3++;
5492 o3 = decode[i] & 0x7F;
5493 i3 = 128*mlevel2[i2] + o3;
5494 mlevel3[i3] = i;
5495 }
5496 return result;
5497}
5498
5499static int
5500encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5501{
5502 struct encoding_map *map = (struct encoding_map*)mapping;
5503 int l1 = c>>11;
5504 int l2 = (c>>7) & 0xF;
5505 int l3 = c & 0x7F;
5506 int i;
5507
5508#ifdef Py_UNICODE_WIDE
5509 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005511 }
5512#endif
5513 if (c == 0)
5514 return 0;
5515 /* level 1*/
5516 i = map->level1[l1];
5517 if (i == 0xFF) {
5518 return -1;
5519 }
5520 /* level 2*/
5521 i = map->level23[16*i+l2];
5522 if (i == 0xFF) {
5523 return -1;
5524 }
5525 /* level 3 */
5526 i = map->level23[16*map->count2 + 128*i + l3];
5527 if (i == 0) {
5528 return -1;
5529 }
5530 return i;
5531}
5532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005533/* Lookup the character ch in the mapping. If the character
5534 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005535 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005536static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537{
Christian Heimes217cfd12007-12-02 14:31:20 +00005538 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005539 PyObject *x;
5540
5541 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005543 x = PyObject_GetItem(mapping, w);
5544 Py_DECREF(w);
5545 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5547 /* No mapping found means: mapping is undefined. */
5548 PyErr_Clear();
5549 x = Py_None;
5550 Py_INCREF(x);
5551 return x;
5552 } else
5553 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005555 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005557 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 long value = PyLong_AS_LONG(x);
5559 if (value < 0 || value > 255) {
5560 PyErr_SetString(PyExc_TypeError,
5561 "character mapping must be in range(256)");
5562 Py_DECREF(x);
5563 return NULL;
5564 }
5565 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005567 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 /* wrong return value */
5571 PyErr_Format(PyExc_TypeError,
5572 "character mapping must return integer, bytes or None, not %.400s",
5573 x->ob_type->tp_name);
5574 Py_DECREF(x);
5575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 }
5577}
5578
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005579static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005580charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005581{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005582 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5583 /* exponentially overallocate to minimize reallocations */
5584 if (requiredsize < 2*outsize)
5585 requiredsize = 2*outsize;
5586 if (_PyBytes_Resize(outobj, requiredsize))
5587 return -1;
5588 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005589}
5590
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred during lookup or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005595 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596 space is available. Return a new reference to the object that
5597 was put in the output buffer, or Py_None, if the mapping was undefined
5598 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005599 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005600static
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005601charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00005602 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005604 PyObject *rep;
5605 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005606 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607
Christian Heimes90aa7642007-12-19 02:45:37 +00005608 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005609 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005611 if (res == -1)
5612 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 if (outsize<requiredsize)
5614 if (charmapencode_resize(outobj, outpos, requiredsize))
5615 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005616 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 outstart[(*outpos)++] = (char)res;
5618 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005619 }
5620
5621 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005624 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 Py_DECREF(rep);
5626 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005627 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 if (PyLong_Check(rep)) {
5629 Py_ssize_t requiredsize = *outpos+1;
5630 if (outsize<requiredsize)
5631 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5632 Py_DECREF(rep);
5633 return enc_EXCEPTION;
5634 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005635 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005636 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005637 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005638 else {
5639 const char *repchars = PyBytes_AS_STRING(rep);
5640 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5641 Py_ssize_t requiredsize = *outpos+repsize;
5642 if (outsize<requiredsize)
5643 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5644 Py_DECREF(rep);
5645 return enc_EXCEPTION;
5646 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005647 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 memcpy(outstart + *outpos, repchars, repsize);
5649 *outpos += repsize;
5650 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005651 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005652 Py_DECREF(rep);
5653 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654}
5655
5656/* handle an error in PyUnicode_EncodeCharmap
5657 Return 0 on success, -1 on error */
5658static
5659int charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005660 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005662 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005663 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664{
5665 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005666 Py_ssize_t repsize;
5667 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005668 Py_UNICODE *uni2;
5669 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005670 Py_ssize_t collstartpos = *inpos;
5671 Py_ssize_t collendpos = *inpos+1;
5672 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005673 char *encoding = "charmap";
5674 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005675 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677 /* find all unencodable characters */
5678 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005679 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005680 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 int res = encoding_map_lookup(p[collendpos], mapping);
5682 if (res != -1)
5683 break;
5684 ++collendpos;
5685 continue;
5686 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005687
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 rep = charmapencode_lookup(p[collendpos], mapping);
5689 if (rep==NULL)
5690 return -1;
5691 else if (rep!=Py_None) {
5692 Py_DECREF(rep);
5693 break;
5694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005695 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 }
5698 /* cache callback name lookup
5699 * (if not done yet, i.e. it's the first error) */
5700 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005701 if ((errors==NULL) || (!strcmp(errors, "strict")))
5702 *known_errorHandler = 1;
5703 else if (!strcmp(errors, "replace"))
5704 *known_errorHandler = 2;
5705 else if (!strcmp(errors, "ignore"))
5706 *known_errorHandler = 3;
5707 else if (!strcmp(errors, "xmlcharrefreplace"))
5708 *known_errorHandler = 4;
5709 else
5710 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005711 }
5712 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005713 case 1: /* strict */
5714 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5715 return -1;
5716 case 2: /* replace */
5717 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 x = charmapencode_output('?', mapping, res, respos);
5719 if (x==enc_EXCEPTION) {
5720 return -1;
5721 }
5722 else if (x==enc_FAILED) {
5723 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5724 return -1;
5725 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005726 }
5727 /* fall through */
5728 case 3: /* ignore */
5729 *inpos = collendpos;
5730 break;
5731 case 4: /* xmlcharrefreplace */
5732 /* generate replacement (temporarily (mis)uses p) */
5733 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 char buffer[2+29+1+1];
5735 char *cp;
5736 sprintf(buffer, "&#%d;", (int)p[collpos]);
5737 for (cp = buffer; *cp; ++cp) {
5738 x = charmapencode_output(*cp, mapping, res, respos);
5739 if (x==enc_EXCEPTION)
5740 return -1;
5741 else if (x==enc_FAILED) {
5742 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5743 return -1;
5744 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005745 }
5746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005747 *inpos = collendpos;
5748 break;
5749 default:
5750 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 encoding, reason, p, size, exceptionObject,
5752 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005753 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005755 if (PyBytes_Check(repunicode)) {
5756 /* Directly copy bytes result to output. */
5757 Py_ssize_t outsize = PyBytes_Size(*res);
5758 Py_ssize_t requiredsize;
5759 repsize = PyBytes_Size(repunicode);
5760 requiredsize = *respos + repsize;
5761 if (requiredsize > outsize)
5762 /* Make room for all additional bytes. */
5763 if (charmapencode_resize(res, respos, requiredsize)) {
5764 Py_DECREF(repunicode);
5765 return -1;
5766 }
5767 memcpy(PyBytes_AsString(*res) + *respos,
5768 PyBytes_AsString(repunicode), repsize);
5769 *respos += repsize;
5770 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005771 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005772 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005774 /* generate replacement */
5775 repsize = PyUnicode_GET_SIZE(repunicode);
5776 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 x = charmapencode_output(*uni2, mapping, res, respos);
5778 if (x==enc_EXCEPTION) {
5779 return -1;
5780 }
5781 else if (x==enc_FAILED) {
5782 Py_DECREF(repunicode);
5783 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5784 return -1;
5785 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005786 }
5787 *inpos = newpos;
5788 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 }
5790 return 0;
5791}
5792
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00005794 Py_ssize_t size,
5795 PyObject *mapping,
5796 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 /* output object */
5799 PyObject *res = NULL;
5800 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005801 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005802 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005803 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 PyObject *errorHandler = NULL;
5805 PyObject *exc = NULL;
5806 /* the following variable is used for caching string comparisons
5807 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5808 * 3=ignore, 4=xmlcharrefreplace */
5809 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810
5811 /* Default to Latin-1 */
5812 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815 /* allocate enough for a simple encoding without
5816 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005817 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005818 if (res == NULL)
5819 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005820 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 /* try to encode it */
5825 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5826 if (x==enc_EXCEPTION) /* error */
5827 goto onError;
5828 if (x==enc_FAILED) { /* unencodable character */
5829 if (charmap_encoding_error(p, size, &inpos, mapping,
5830 &exc,
5831 &known_errorHandler, &errorHandler, errors,
5832 &res, &respos)) {
5833 goto onError;
5834 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005835 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 else
5837 /* done with this character => adjust input position */
5838 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005841 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005842 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005843 if (_PyBytes_Resize(&res, respos) < 0)
5844 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005845
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846 Py_XDECREF(exc);
5847 Py_XDECREF(errorHandler);
5848 return res;
5849
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005851 Py_XDECREF(res);
5852 Py_XDECREF(exc);
5853 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 return NULL;
5855}
5856
5857PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859{
5860 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 PyErr_BadArgument();
5862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 }
5864 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 PyUnicode_GET_SIZE(unicode),
5866 mapping,
5867 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868}
5869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870/* create or adjust a UnicodeTranslateError */
5871static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 const Py_UNICODE *unicode, Py_ssize_t size,
5873 Py_ssize_t startpos, Py_ssize_t endpos,
5874 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005877 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879 }
5880 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5882 goto onError;
5883 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5884 goto onError;
5885 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5886 goto onError;
5887 return;
5888 onError:
5889 Py_DECREF(*exceptionObject);
5890 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 }
5892}
5893
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005894/* raises a UnicodeTranslateError */
5895static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 const Py_UNICODE *unicode, Py_ssize_t size,
5897 Py_ssize_t startpos, Py_ssize_t endpos,
5898 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005899{
5900 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904}
5905
5906/* error handling callback helper:
5907 build arguments, call the callback and check the arguments,
5908 put the result into newpos and return the replacement string, which
5909 has to be freed by the caller */
5910static PyObject *unicode_translate_call_errorhandler(const char *errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00005911 PyObject **errorHandler,
5912 const char *reason,
5913 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5914 Py_ssize_t startpos, Py_ssize_t endpos,
5915 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005917 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005918
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005919 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005920 PyObject *restuple;
5921 PyObject *resunicode;
5922
5923 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927 }
5928
5929 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005933
5934 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005935 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005937 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005939 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 Py_DECREF(restuple);
5941 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005942 }
5943 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 &resunicode, &i_newpos)) {
5945 Py_DECREF(restuple);
5946 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005947 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00005948 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005950 else
5951 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005952 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5954 Py_DECREF(restuple);
5955 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00005956 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005957 Py_INCREF(resunicode);
5958 Py_DECREF(restuple);
5959 return resunicode;
5960}
5961
5962/* Lookup the character ch in the mapping and put the result in result,
5963 which must be decrefed by the caller.
5964 Return 0 on success, -1 on error */
5965static
5966int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5967{
Christian Heimes217cfd12007-12-02 14:31:20 +00005968 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005969 PyObject *x;
5970
5971 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005973 x = PyObject_GetItem(mapping, w);
5974 Py_DECREF(w);
5975 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5977 /* No mapping found means: use 1:1 mapping. */
5978 PyErr_Clear();
5979 *result = NULL;
5980 return 0;
5981 } else
5982 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005983 }
5984 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 *result = x;
5986 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 }
Christian Heimes217cfd12007-12-02 14:31:20 +00005988 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 long value = PyLong_AS_LONG(x);
5990 long max = PyUnicode_GetMax();
5991 if (value < 0 || value > max) {
5992 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00005993 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 Py_DECREF(x);
5995 return -1;
5996 }
5997 *result = x;
5998 return 0;
5999 }
6000 else if (PyUnicode_Check(x)) {
6001 *result = x;
6002 return 0;
6003 }
6004 else {
6005 /* wrong return value */
6006 PyErr_SetString(PyExc_TypeError,
6007 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006008 Py_DECREF(x);
6009 return -1;
6010 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011}
6012/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 if not reallocate and adjust various state variables.
6014 Return 0 on success, -1 on error */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015static
Walter Dörwald4894c302003-10-24 14:25:28 +00006016int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006019 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006020 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 /* remember old output position */
6022 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6023 /* exponentially overallocate to minimize reallocations */
6024 if (requiredsize < 2 * oldsize)
6025 requiredsize = 2 * oldsize;
6026 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6027 return -1;
6028 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 }
6030 return 0;
6031}
6032/* lookup the character, put the result in the output string and adjust
6033 various state variables. Return a new reference to the object that
6034 was put in the output buffer in *result, or Py_None, if the mapping was
6035 undefined (in which case no character was written).
6036 The called must decref result.
6037 Return 0 on success, -1 on error. */
6038static
Walter Dörwald4894c302003-10-24 14:25:28 +00006039int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6041 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042{
Walter Dörwald4894c302003-10-24 14:25:28 +00006043 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006044 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 /* not found => default to 1:1 mapping */
6047 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048 }
6049 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006051 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 /* no overflow check, because we know that the space is enough */
6053 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006054 }
6055 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6057 if (repsize==1) {
6058 /* no overflow check, because we know that the space is enough */
6059 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6060 }
6061 else if (repsize!=0) {
6062 /* more than one character */
6063 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6064 (insize - (curinp-startinp)) +
6065 repsize - 1;
6066 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6067 return -1;
6068 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6069 *outp += repsize;
6070 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 }
6072 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006074 return 0;
6075}
6076
6077PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 Py_ssize_t size,
6079 PyObject *mapping,
6080 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082 /* output object */
6083 PyObject *res = NULL;
6084 /* pointers to the beginning and end+1 of input */
6085 const Py_UNICODE *startp = p;
6086 const Py_UNICODE *endp = p + size;
6087 /* pointer into the output */
6088 Py_UNICODE *str;
6089 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006090 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006091 char *reason = "character maps to <undefined>";
6092 PyObject *errorHandler = NULL;
6093 PyObject *exc = NULL;
6094 /* the following variable is used for caching string comparisons
6095 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6096 * 3=ignore, 4=xmlcharrefreplace */
6097 int known_errorHandler = -1;
6098
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 PyErr_BadArgument();
6101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006103
6104 /* allocate enough for a simple 1:1 translation without
6105 replacements, if we need more, we'll resize */
6106 res = PyUnicode_FromUnicode(NULL, size);
6107 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 /* try to encode it */
6115 PyObject *x = NULL;
6116 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6117 Py_XDECREF(x);
6118 goto onError;
6119 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006120 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 if (x!=Py_None) /* it worked => adjust input pointer */
6122 ++p;
6123 else { /* untranslatable character */
6124 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6125 Py_ssize_t repsize;
6126 Py_ssize_t newpos;
6127 Py_UNICODE *uni2;
6128 /* startpos for collecting untranslatable chars */
6129 const Py_UNICODE *collstart = p;
6130 const Py_UNICODE *collend = p+1;
6131 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 /* find all untranslatable characters */
6134 while (collend < endp) {
6135 if (charmaptranslate_lookup(*collend, mapping, &x))
6136 goto onError;
6137 Py_XDECREF(x);
6138 if (x!=Py_None)
6139 break;
6140 ++collend;
6141 }
6142 /* cache callback name lookup
6143 * (if not done yet, i.e. it's the first error) */
6144 if (known_errorHandler==-1) {
6145 if ((errors==NULL) || (!strcmp(errors, "strict")))
6146 known_errorHandler = 1;
6147 else if (!strcmp(errors, "replace"))
6148 known_errorHandler = 2;
6149 else if (!strcmp(errors, "ignore"))
6150 known_errorHandler = 3;
6151 else if (!strcmp(errors, "xmlcharrefreplace"))
6152 known_errorHandler = 4;
6153 else
6154 known_errorHandler = 0;
6155 }
6156 switch (known_errorHandler) {
6157 case 1: /* strict */
6158 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006159 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 case 2: /* replace */
6161 /* No need to check for space, this is a 1:1 replacement */
6162 for (coll = collstart; coll<collend; ++coll)
6163 *str++ = '?';
6164 /* fall through */
6165 case 3: /* ignore */
6166 p = collend;
6167 break;
6168 case 4: /* xmlcharrefreplace */
6169 /* generate replacement (temporarily (mis)uses p) */
6170 for (p = collstart; p < collend; ++p) {
6171 char buffer[2+29+1+1];
6172 char *cp;
6173 sprintf(buffer, "&#%d;", (int)*p);
6174 if (charmaptranslate_makespace(&res, &str,
6175 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6176 goto onError;
6177 for (cp = buffer; *cp; ++cp)
6178 *str++ = *cp;
6179 }
6180 p = collend;
6181 break;
6182 default:
6183 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6184 reason, startp, size, &exc,
6185 collstart-startp, collend-startp, &newpos);
6186 if (repunicode == NULL)
6187 goto onError;
6188 /* generate replacement */
6189 repsize = PyUnicode_GET_SIZE(repunicode);
6190 if (charmaptranslate_makespace(&res, &str,
6191 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6192 Py_DECREF(repunicode);
6193 goto onError;
6194 }
6195 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6196 *str++ = *uni2;
6197 p = startp + newpos;
6198 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006199 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006200 }
6201 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006202 /* Resize if we allocated to much */
6203 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006204 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 if (PyUnicode_Resize(&res, respos) < 0)
6206 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006207 }
6208 Py_XDECREF(exc);
6209 Py_XDECREF(errorHandler);
6210 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006213 Py_XDECREF(res);
6214 Py_XDECREF(exc);
6215 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 return NULL;
6217}
6218
6219PyObject *PyUnicode_Translate(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 PyObject *mapping,
6221 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222{
6223 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006224
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 str = PyUnicode_FromObject(str);
6226 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 PyUnicode_GET_SIZE(str),
6230 mapping,
6231 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 Py_DECREF(str);
6233 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006234
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 Py_XDECREF(str);
6237 return NULL;
6238}
Tim Petersced69f82003-09-16 20:30:58 +00006239
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006240PyObject *
6241PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6242 Py_ssize_t length)
6243{
6244 PyObject *result;
6245 Py_UNICODE *p; /* write pointer into result */
6246 Py_ssize_t i;
6247 /* Copy to a new string */
6248 result = (PyObject *)_PyUnicode_New(length);
6249 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6250 if (result == NULL)
6251 return result;
6252 p = PyUnicode_AS_UNICODE(result);
6253 /* Iterate over code points */
6254 for (i = 0; i < length; i++) {
6255 Py_UNICODE ch =s[i];
6256 if (ch > 127) {
6257 int decimal = Py_UNICODE_TODECIMAL(ch);
6258 if (decimal >= 0)
6259 p[i] = '0' + decimal;
6260 }
6261 }
6262 return result;
6263}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006264/* --- Decimal Encoder ---------------------------------------------------- */
6265
6266int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006267 Py_ssize_t length,
6268 char *output,
6269 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006270{
6271 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 PyObject *errorHandler = NULL;
6273 PyObject *exc = NULL;
6274 const char *encoding = "decimal";
6275 const char *reason = "invalid decimal Unicode string";
6276 /* the following variable is used for caching string comparisons
6277 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6278 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006279
6280 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 PyErr_BadArgument();
6282 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006283 }
6284
6285 p = s;
6286 end = s + length;
6287 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 register Py_UNICODE ch = *p;
6289 int decimal;
6290 PyObject *repunicode;
6291 Py_ssize_t repsize;
6292 Py_ssize_t newpos;
6293 Py_UNICODE *uni2;
6294 Py_UNICODE *collstart;
6295 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006296
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006298 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 ++p;
6300 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006301 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 decimal = Py_UNICODE_TODECIMAL(ch);
6303 if (decimal >= 0) {
6304 *output++ = '0' + decimal;
6305 ++p;
6306 continue;
6307 }
6308 if (0 < ch && ch < 256) {
6309 *output++ = (char)ch;
6310 ++p;
6311 continue;
6312 }
6313 /* All other characters are considered unencodable */
6314 collstart = p;
6315 collend = p+1;
6316 while (collend < end) {
6317 if ((0 < *collend && *collend < 256) ||
6318 !Py_UNICODE_ISSPACE(*collend) ||
6319 Py_UNICODE_TODECIMAL(*collend))
6320 break;
6321 }
6322 /* cache callback name lookup
6323 * (if not done yet, i.e. it's the first error) */
6324 if (known_errorHandler==-1) {
6325 if ((errors==NULL) || (!strcmp(errors, "strict")))
6326 known_errorHandler = 1;
6327 else if (!strcmp(errors, "replace"))
6328 known_errorHandler = 2;
6329 else if (!strcmp(errors, "ignore"))
6330 known_errorHandler = 3;
6331 else if (!strcmp(errors, "xmlcharrefreplace"))
6332 known_errorHandler = 4;
6333 else
6334 known_errorHandler = 0;
6335 }
6336 switch (known_errorHandler) {
6337 case 1: /* strict */
6338 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6339 goto onError;
6340 case 2: /* replace */
6341 for (p = collstart; p < collend; ++p)
6342 *output++ = '?';
6343 /* fall through */
6344 case 3: /* ignore */
6345 p = collend;
6346 break;
6347 case 4: /* xmlcharrefreplace */
6348 /* generate replacement (temporarily (mis)uses p) */
6349 for (p = collstart; p < collend; ++p)
6350 output += sprintf(output, "&#%d;", (int)*p);
6351 p = collend;
6352 break;
6353 default:
6354 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6355 encoding, reason, s, length, &exc,
6356 collstart-s, collend-s, &newpos);
6357 if (repunicode == NULL)
6358 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006359 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006360 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006361 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6362 Py_DECREF(repunicode);
6363 goto onError;
6364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 /* generate replacement */
6366 repsize = PyUnicode_GET_SIZE(repunicode);
6367 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6368 Py_UNICODE ch = *uni2;
6369 if (Py_UNICODE_ISSPACE(ch))
6370 *output++ = ' ';
6371 else {
6372 decimal = Py_UNICODE_TODECIMAL(ch);
6373 if (decimal >= 0)
6374 *output++ = '0' + decimal;
6375 else if (0 < ch && ch < 256)
6376 *output++ = (char)ch;
6377 else {
6378 Py_DECREF(repunicode);
6379 raise_encode_exception(&exc, encoding,
6380 s, length, collstart-s, collend-s, reason);
6381 goto onError;
6382 }
6383 }
6384 }
6385 p = s + newpos;
6386 Py_DECREF(repunicode);
6387 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006388 }
6389 /* 0-terminate the output string */
6390 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391 Py_XDECREF(exc);
6392 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006393 return 0;
6394
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 Py_XDECREF(exc);
6397 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006398 return -1;
6399}
6400
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401/* --- Helpers ------------------------------------------------------------ */
6402
Eric Smith8c663262007-08-25 02:26:07 +00006403#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006404#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006405
Thomas Wouters477c8d52006-05-27 19:21:47 +00006406#include "stringlib/count.h"
6407#include "stringlib/find.h"
6408#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006409#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006410
Eric Smith5807c412008-05-11 21:00:57 +00006411#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006412#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006413#include "stringlib/localeutil.h"
6414
/* Helper macro to fix up start/end slice values against a length len.
   start and end must be modifiable lvalues; they are updated in place:
     - end greater than len is clamped to len,
     - a negative end or start is offset by len and then clamped to 0.
   start is not clamped against len.
   The body is wrapped in do { } while (0) so the macro behaves as a single
   statement (safe in an unbraced if/else); use it with a trailing ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006429
Martin v. Löwis18e16552006-02-15 17:27:45 +00006430Py_ssize_t PyUnicode_Count(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006431 PyObject *substr,
6432 Py_ssize_t start,
6433 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006435 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006436 PyUnicodeObject* str_obj;
6437 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006438
Thomas Wouters477c8d52006-05-27 19:21:47 +00006439 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6440 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006442 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6443 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 Py_DECREF(str_obj);
6445 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 }
Tim Petersced69f82003-09-16 20:30:58 +00006447
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006448 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006449 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006450 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6451 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006452 );
6453
6454 Py_DECREF(sub_obj);
6455 Py_DECREF(str_obj);
6456
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 return result;
6458}
6459
Martin v. Löwis18e16552006-02-15 17:27:45 +00006460Py_ssize_t PyUnicode_Find(PyObject *str,
Thomas Wouters477c8d52006-05-27 19:21:47 +00006461 PyObject *sub,
6462 Py_ssize_t start,
6463 Py_ssize_t end,
6464 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006466 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006467
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006469 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006471 sub = PyUnicode_FromObject(sub);
6472 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 Py_DECREF(str);
6474 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 }
Tim Petersced69f82003-09-16 20:30:58 +00006476
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477 if (direction > 0)
6478 result = stringlib_find_slice(
6479 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6480 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6481 start, end
6482 );
6483 else
6484 result = stringlib_rfind_slice(
6485 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6486 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6487 start, end
6488 );
6489
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006491 Py_DECREF(sub);
6492
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 return result;
6494}
6495
Tim Petersced69f82003-09-16 20:30:58 +00006496static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497int tailmatch(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 PyUnicodeObject *substring,
6499 Py_ssize_t start,
6500 Py_ssize_t end,
6501 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 if (substring->length == 0)
6504 return 1;
6505
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006506 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 end -= substring->length;
6508 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510
6511 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 if (Py_UNICODE_MATCH(self, end, substring))
6513 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 } else {
6515 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 }
6518
6519 return 0;
6520}
6521
Martin v. Löwis18e16552006-02-15 17:27:45 +00006522Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 PyObject *substr,
6524 Py_ssize_t start,
6525 Py_ssize_t end,
6526 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006528 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006529
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 str = PyUnicode_FromObject(str);
6531 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 substr = PyUnicode_FromObject(substr);
6534 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 Py_DECREF(str);
6536 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 }
Tim Petersced69f82003-09-16 20:30:58 +00006538
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 (PyUnicodeObject *)substr,
6541 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 Py_DECREF(str);
6543 Py_DECREF(substr);
6544 return result;
6545}
6546
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547/* Apply fixfct filter to the Unicode object self and return a
6548 reference to the modified object */
6549
Tim Petersced69f82003-09-16 20:30:58 +00006550static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006553{
6554
6555 PyUnicodeObject *u;
6556
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006557 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006560
6561 Py_UNICODE_COPY(u->str, self->str, self->length);
6562
Tim Peters7a29bd52001-09-12 03:03:31 +00006563 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 /* fixfct should return TRUE if it modified the buffer. If
6565 FALSE, return a reference to the original buffer instead
6566 (to save space, not time) */
6567 Py_INCREF(self);
6568 Py_DECREF(u);
6569 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006570 }
6571 return (PyObject*) u;
6572}
6573
Tim Petersced69f82003-09-16 20:30:58 +00006574static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575int fixupper(PyUnicodeObject *self)
6576{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006577 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 Py_UNICODE *s = self->str;
6579 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006580
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006583
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 ch = Py_UNICODE_TOUPPER(*s);
6585 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 *s = ch;
6588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006589 s++;
6590 }
6591
6592 return status;
6593}
6594
Tim Petersced69f82003-09-16 20:30:58 +00006595static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596int fixlower(PyUnicodeObject *self)
6597{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006598 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 Py_UNICODE *s = self->str;
6600 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006601
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006603 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006604
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 ch = Py_UNICODE_TOLOWER(*s);
6606 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 *s = ch;
6609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 s++;
6611 }
6612
6613 return status;
6614}
6615
Tim Petersced69f82003-09-16 20:30:58 +00006616static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617int fixswapcase(PyUnicodeObject *self)
6618{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006619 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 Py_UNICODE *s = self->str;
6621 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006622
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 while (len-- > 0) {
6624 if (Py_UNICODE_ISUPPER(*s)) {
6625 *s = Py_UNICODE_TOLOWER(*s);
6626 status = 1;
6627 } else if (Py_UNICODE_ISLOWER(*s)) {
6628 *s = Py_UNICODE_TOUPPER(*s);
6629 status = 1;
6630 }
6631 s++;
6632 }
6633
6634 return status;
6635}
6636
Tim Petersced69f82003-09-16 20:30:58 +00006637static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006638int fixcapitalize(PyUnicodeObject *self)
6639{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006640 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006641 Py_UNICODE *s = self->str;
6642 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006643
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006644 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006645 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006646 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006647 *s = Py_UNICODE_TOUPPER(*s);
6648 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006650 s++;
6651 while (--len > 0) {
6652 if (Py_UNICODE_ISUPPER(*s)) {
6653 *s = Py_UNICODE_TOLOWER(*s);
6654 status = 1;
6655 }
6656 s++;
6657 }
6658 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659}
6660
6661static
6662int fixtitle(PyUnicodeObject *self)
6663{
6664 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6665 register Py_UNICODE *e;
6666 int previous_is_cased;
6667
6668 /* Shortcut for single character strings */
6669 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006670 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6671 if (*p != ch) {
6672 *p = ch;
6673 return 1;
6674 }
6675 else
6676 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 }
Tim Petersced69f82003-09-16 20:30:58 +00006678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 e = p + PyUnicode_GET_SIZE(self);
6680 previous_is_cased = 0;
6681 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006682 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006683
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 if (previous_is_cased)
6685 *p = Py_UNICODE_TOLOWER(ch);
6686 else
6687 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006688
Benjamin Peterson29060642009-01-31 22:14:21 +00006689 if (Py_UNICODE_ISLOWER(ch) ||
6690 Py_UNICODE_ISUPPER(ch) ||
6691 Py_UNICODE_ISTITLE(ch))
6692 previous_is_cased = 1;
6693 else
6694 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 }
6696 return 1;
6697}
6698
Tim Peters8ce9f162004-08-27 01:49:32 +00006699PyObject *
6700PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701{
Skip Montanaro6543b452004-09-16 03:28:13 +00006702 const Py_UNICODE blank = ' ';
6703 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006704 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006705 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006706 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6707 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006708 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6709 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006710 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006711 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712
Tim Peters05eba1f2004-08-27 21:32:02 +00006713 fseq = PySequence_Fast(seq, "");
6714 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006715 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006716 }
6717
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006718 /* NOTE: the following code can't call back into Python code,
6719 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006720 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006721
Tim Peters05eba1f2004-08-27 21:32:02 +00006722 seqlen = PySequence_Fast_GET_SIZE(fseq);
6723 /* If empty sequence, return u"". */
6724 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006725 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6726 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006727 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006728 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006729 /* If singleton sequence with an exact Unicode, return that. */
6730 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 item = items[0];
6732 if (PyUnicode_CheckExact(item)) {
6733 Py_INCREF(item);
6734 res = (PyUnicodeObject *)item;
6735 goto Done;
6736 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006737 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006738 else {
6739 /* Set up sep and seplen */
6740 if (separator == NULL) {
6741 sep = &blank;
6742 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006743 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006744 else {
6745 if (!PyUnicode_Check(separator)) {
6746 PyErr_Format(PyExc_TypeError,
6747 "separator: expected str instance,"
6748 " %.80s found",
6749 Py_TYPE(separator)->tp_name);
6750 goto onError;
6751 }
6752 sep = PyUnicode_AS_UNICODE(separator);
6753 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006754 }
6755 }
6756
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006757 /* There are at least two things to join, or else we have a subclass
6758 * of str in the sequence.
6759 * Do a pre-pass to figure out the total amount of space we'll
6760 * need (sz), and see whether all argument are strings.
6761 */
6762 sz = 0;
6763 for (i = 0; i < seqlen; i++) {
6764 const Py_ssize_t old_sz = sz;
6765 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 if (!PyUnicode_Check(item)) {
6767 PyErr_Format(PyExc_TypeError,
6768 "sequence item %zd: expected str instance,"
6769 " %.80s found",
6770 i, Py_TYPE(item)->tp_name);
6771 goto onError;
6772 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006773 sz += PyUnicode_GET_SIZE(item);
6774 if (i != 0)
6775 sz += seplen;
6776 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6777 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006779 goto onError;
6780 }
6781 }
Tim Petersced69f82003-09-16 20:30:58 +00006782
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006783 res = _PyUnicode_New(sz);
6784 if (res == NULL)
6785 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006786
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006787 /* Catenate everything. */
6788 res_p = PyUnicode_AS_UNICODE(res);
6789 for (i = 0; i < seqlen; ++i) {
6790 Py_ssize_t itemlen;
6791 item = items[i];
6792 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 /* Copy item, and maybe the separator. */
6794 if (i) {
6795 Py_UNICODE_COPY(res_p, sep, seplen);
6796 res_p += seplen;
6797 }
6798 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6799 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006800 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006801
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006803 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804 return (PyObject *)res;
6805
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006807 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006808 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006809 return NULL;
6810}
6811
Tim Petersced69f82003-09-16 20:30:58 +00006812static
6813PyUnicodeObject *pad(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 Py_ssize_t left,
6815 Py_ssize_t right,
6816 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817{
6818 PyUnicodeObject *u;
6819
6820 if (left < 0)
6821 left = 0;
6822 if (right < 0)
6823 right = 0;
6824
Tim Peters7a29bd52001-09-12 03:03:31 +00006825 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 Py_INCREF(self);
6827 return self;
6828 }
6829
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006830 if (left > PY_SSIZE_T_MAX - self->length ||
6831 right > PY_SSIZE_T_MAX - (left + self->length)) {
6832 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6833 return NULL;
6834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 u = _PyUnicode_New(left + self->length + right);
6836 if (u) {
6837 if (left)
6838 Py_UNICODE_FILL(u->str, fill, left);
6839 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6840 if (right)
6841 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6842 }
6843
6844 return u;
6845}
6846
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006847PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850
6851 string = PyUnicode_FromObject(string);
6852 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006855 list = stringlib_splitlines(
6856 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6857 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858
6859 Py_DECREF(string);
6860 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861}
6862
Tim Petersced69f82003-09-16 20:30:58 +00006863static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864PyObject *split(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006865 PyUnicodeObject *substring,
6866 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006869 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006872 return stringlib_split_whitespace(
6873 (PyObject*) self, self->str, self->length, maxcount
6874 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006876 return stringlib_split(
6877 (PyObject*) self, self->str, self->length,
6878 substring->str, substring->length,
6879 maxcount
6880 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881}
6882
Tim Petersced69f82003-09-16 20:30:58 +00006883static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006884PyObject *rsplit(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 PyUnicodeObject *substring,
6886 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006887{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006888 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006889 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006890
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006891 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006892 return stringlib_rsplit_whitespace(
6893 (PyObject*) self, self->str, self->length, maxcount
6894 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006895
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006896 return stringlib_rsplit(
6897 (PyObject*) self, self->str, self->length,
6898 substring->str, substring->length,
6899 maxcount
6900 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006901}
6902
6903static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00006905 PyUnicodeObject *str1,
6906 PyUnicodeObject *str2,
6907 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006908{
6909 PyUnicodeObject *u;
6910
6911 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006912 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006913 else if (maxcount == 0 || self->length == 0)
6914 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915
Thomas Wouters477c8d52006-05-27 19:21:47 +00006916 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006917 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006918 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006919 if (str1->length == 0)
6920 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006921 if (str1->length == 1) {
6922 /* replace characters */
6923 Py_UNICODE u1, u2;
6924 if (!findchar(self->str, self->length, str1->str[0]))
6925 goto nothing;
6926 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6927 if (!u)
6928 return NULL;
6929 Py_UNICODE_COPY(u->str, self->str, self->length);
6930 u1 = str1->str[0];
6931 u2 = str2->str[0];
6932 for (i = 0; i < u->length; i++)
6933 if (u->str[i] == u1) {
6934 if (--maxcount < 0)
6935 break;
6936 u->str[i] = u2;
6937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006939 i = stringlib_find(
6940 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00006942 if (i < 0)
6943 goto nothing;
6944 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6945 if (!u)
6946 return NULL;
6947 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006948
6949 /* change everything in-place, starting with this one */
6950 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6951 i += str1->length;
6952
6953 while ( --maxcount > 0) {
6954 i = stringlib_find(self->str+i, self->length-i,
6955 str1->str, str1->length,
6956 i);
6957 if (i == -1)
6958 break;
6959 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6960 i += str1->length;
6961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006964
6965 Py_ssize_t n, i, j, e;
6966 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967 Py_UNICODE *p;
6968
6969 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006970 n = stringlib_count(self->str, self->length, str1->str, str1->length,
6971 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006972 if (n == 0)
6973 goto nothing;
6974 /* new_size = self->length + n * (str2->length - str1->length)); */
6975 delta = (str2->length - str1->length);
6976 if (delta == 0) {
6977 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006979 product = n * (str2->length - str1->length);
6980 if ((product / (str2->length - str1->length)) != n) {
6981 PyErr_SetString(PyExc_OverflowError,
6982 "replace string is too long");
6983 return NULL;
6984 }
6985 new_size = self->length + product;
6986 if (new_size < 0) {
6987 PyErr_SetString(PyExc_OverflowError,
6988 "replace string is too long");
6989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990 }
6991 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006992 u = _PyUnicode_New(new_size);
6993 if (!u)
6994 return NULL;
6995 i = 0;
6996 p = u->str;
6997 e = self->length - str1->length;
6998 if (str1->length > 0) {
6999 while (n-- > 0) {
7000 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007001 j = stringlib_find(self->str+i, self->length-i,
7002 str1->str, str1->length,
7003 i);
7004 if (j == -1)
7005 break;
7006 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007007 /* copy unchanged part [i:j] */
7008 Py_UNICODE_COPY(p, self->str+i, j-i);
7009 p += j - i;
7010 }
7011 /* copy substitution string */
7012 if (str2->length > 0) {
7013 Py_UNICODE_COPY(p, str2->str, str2->length);
7014 p += str2->length;
7015 }
7016 i = j + str1->length;
7017 }
7018 if (i < self->length)
7019 /* copy tail [i:] */
7020 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7021 } else {
7022 /* interleave */
7023 while (n > 0) {
7024 Py_UNICODE_COPY(p, str2->str, str2->length);
7025 p += str2->length;
7026 if (--n <= 0)
7027 break;
7028 *p++ = self->str[i++];
7029 }
7030 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007034
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007036 /* nothing to replace; return original string (when possible) */
7037 if (PyUnicode_CheckExact(self)) {
7038 Py_INCREF(self);
7039 return (PyObject *) self;
7040 }
7041 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042}
7043
7044/* --- Unicode Object Methods --------------------------------------------- */
7045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007046PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007047 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048\n\
7049Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007050characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051
7052static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007053unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007055 return fixup(self, fixtitle);
7056}
7057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007058PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060\n\
7061Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007062have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063
7064static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007065unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 return fixup(self, fixcapitalize);
7068}
7069
#if 0
/* Disabled: not compiled. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize every word through
   fixup(..., fixcapitalize) and join the words with single blanks.
   Returns NULL when any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
7107
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007108/* Argument converter. Coerces to a single unicode character */
7109
7110static int
7111convert_uc(PyObject *obj, void *addr)
7112{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007113 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7114 PyObject *uniobj;
7115 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007116
Benjamin Peterson14339b62009-01-31 16:36:08 +00007117 uniobj = PyUnicode_FromObject(obj);
7118 if (uniobj == NULL) {
7119 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007121 return 0;
7122 }
7123 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7124 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007126 Py_DECREF(uniobj);
7127 return 0;
7128 }
7129 unistr = PyUnicode_AS_UNICODE(uniobj);
7130 *fillcharloc = unistr[0];
7131 Py_DECREF(uniobj);
7132 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007133}
7134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007135PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007138Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007139done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140
7141static PyObject *
7142unicode_center(PyUnicodeObject *self, PyObject *args)
7143{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007144 Py_ssize_t marg, left;
7145 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007146 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007147
Thomas Woutersde017742006-02-16 19:34:37 +00007148 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 return NULL;
7150
Tim Peters7a29bd52001-09-12 03:03:31 +00007151 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 Py_INCREF(self);
7153 return (PyObject*) self;
7154 }
7155
7156 marg = width - self->length;
7157 left = marg / 2 + (marg & width & 1);
7158
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007159 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160}
7161
Marc-André Lemburge5034372000-08-08 08:04:29 +00007162#if 0
7163
7164/* This code should go into some future Unicode collation support
7165 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007166 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007167
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007168/* speedy UTF-16 code point order comparison */
7169/* gleaned from: */
7170/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7171
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007172static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007173{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007174 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007175 0, 0, 0, 0, 0, 0, 0, 0,
7176 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007177 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007178};
7179
Guido van Rossumd57fd912000-03-10 22:53:23 +00007180static int
7181unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7182{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007183 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007184
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185 Py_UNICODE *s1 = str1->str;
7186 Py_UNICODE *s2 = str2->str;
7187
7188 len1 = str1->length;
7189 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007190
Guido van Rossumd57fd912000-03-10 22:53:23 +00007191 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007192 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007193
7194 c1 = *s1++;
7195 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007196
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 if (c1 > (1<<11) * 26)
7198 c1 += utf16Fixup[c1>>11];
7199 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007200 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007201 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007202
7203 if (c1 != c2)
7204 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007205
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007206 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 }
7208
7209 return (len1 < len2) ? -1 : (len1 != len2);
7210}
7211
Marc-André Lemburge5034372000-08-08 08:04:29 +00007212#else
7213
7214static int
7215unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7216{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007217 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007218
7219 Py_UNICODE *s1 = str1->str;
7220 Py_UNICODE *s2 = str2->str;
7221
7222 len1 = str1->length;
7223 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007224
Marc-André Lemburge5034372000-08-08 08:04:29 +00007225 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007226 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007227
Fredrik Lundh45714e92001-06-26 16:39:36 +00007228 c1 = *s1++;
7229 c2 = *s2++;
7230
7231 if (c1 != c2)
7232 return (c1 < c2) ? -1 : 1;
7233
Marc-André Lemburge5034372000-08-08 08:04:29 +00007234 len1--; len2--;
7235 }
7236
7237 return (len1 < len2) ? -1 : (len1 != len2);
7238}
7239
7240#endif
7241
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242int PyUnicode_Compare(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007243 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007244{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007245 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7246 return unicode_compare((PyUnicodeObject *)left,
7247 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007248 PyErr_Format(PyExc_TypeError,
7249 "Can't compare %.100s and %.100s",
7250 left->ob_type->tp_name,
7251 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252 return -1;
7253}
7254
Martin v. Löwis5b222132007-06-10 09:51:05 +00007255int
7256PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7257{
7258 int i;
7259 Py_UNICODE *id;
7260 assert(PyUnicode_Check(uni));
7261 id = PyUnicode_AS_UNICODE(uni);
7262 /* Compare Unicode string and source character set string */
7263 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 if (id[i] != str[i])
7265 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007266 /* This check keeps Python strings that end in '\0' from comparing equal
7267 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007268 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007269 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007270 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007272 return 0;
7273}
7274
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007275
Benjamin Peterson29060642009-01-31 22:14:21 +00007276#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007277 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007278
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007279PyObject *PyUnicode_RichCompare(PyObject *left,
7280 PyObject *right,
7281 int op)
7282{
7283 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007284
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007285 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7286 PyObject *v;
7287 if (((PyUnicodeObject *) left)->length !=
7288 ((PyUnicodeObject *) right)->length) {
7289 if (op == Py_EQ) {
7290 Py_INCREF(Py_False);
7291 return Py_False;
7292 }
7293 if (op == Py_NE) {
7294 Py_INCREF(Py_True);
7295 return Py_True;
7296 }
7297 }
7298 if (left == right)
7299 result = 0;
7300 else
7301 result = unicode_compare((PyUnicodeObject *)left,
7302 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007303
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007304 /* Convert the return value to a Boolean */
7305 switch (op) {
7306 case Py_EQ:
7307 v = TEST_COND(result == 0);
7308 break;
7309 case Py_NE:
7310 v = TEST_COND(result != 0);
7311 break;
7312 case Py_LE:
7313 v = TEST_COND(result <= 0);
7314 break;
7315 case Py_GE:
7316 v = TEST_COND(result >= 0);
7317 break;
7318 case Py_LT:
7319 v = TEST_COND(result == -1);
7320 break;
7321 case Py_GT:
7322 v = TEST_COND(result == 1);
7323 break;
7324 default:
7325 PyErr_BadArgument();
7326 return NULL;
7327 }
7328 Py_INCREF(v);
7329 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007331
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007332 Py_INCREF(Py_NotImplemented);
7333 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007334}
7335
Guido van Rossum403d68b2000-03-13 15:55:09 +00007336int PyUnicode_Contains(PyObject *container,
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007338{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007339 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007340 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007341
7342 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007343 sub = PyUnicode_FromObject(element);
7344 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 PyErr_Format(PyExc_TypeError,
7346 "'in <string>' requires string as left operand, not %s",
7347 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007348 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007349 }
7350
Thomas Wouters477c8d52006-05-27 19:21:47 +00007351 str = PyUnicode_FromObject(container);
7352 if (!str) {
7353 Py_DECREF(sub);
7354 return -1;
7355 }
7356
7357 result = stringlib_contains_obj(str, sub);
7358
7359 Py_DECREF(str);
7360 Py_DECREF(sub);
7361
Guido van Rossum403d68b2000-03-13 15:55:09 +00007362 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007363}
7364
Guido van Rossumd57fd912000-03-10 22:53:23 +00007365/* Concat to string or Unicode object giving a new Unicode object. */
7366
7367PyObject *PyUnicode_Concat(PyObject *left,
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007369{
7370 PyUnicodeObject *u = NULL, *v = NULL, *w;
7371
7372 /* Coerce the two arguments */
7373 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7374 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7377 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007378 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379
7380 /* Shortcuts */
7381 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 Py_DECREF(v);
7383 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384 }
7385 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 Py_DECREF(u);
7387 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388 }
7389
7390 /* Concat the two Unicode strings */
7391 w = _PyUnicode_New(u->length + v->length);
7392 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394 Py_UNICODE_COPY(w->str, u->str, u->length);
7395 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7396
7397 Py_DECREF(u);
7398 Py_DECREF(v);
7399 return (PyObject *)w;
7400
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 Py_XDECREF(u);
7403 Py_XDECREF(v);
7404 return NULL;
7405}
7406
Walter Dörwald1ab83302007-05-18 17:15:44 +00007407void
7408PyUnicode_Append(PyObject **pleft, PyObject *right)
7409{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007410 PyObject *new;
7411 if (*pleft == NULL)
7412 return;
7413 if (right == NULL || !PyUnicode_Check(*pleft)) {
7414 Py_DECREF(*pleft);
7415 *pleft = NULL;
7416 return;
7417 }
7418 new = PyUnicode_Concat(*pleft, right);
7419 Py_DECREF(*pleft);
7420 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007421}
7422
7423void
7424PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7425{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007426 PyUnicode_Append(pleft, right);
7427 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007428}
7429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007430PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007431 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007432\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007433Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007434string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007435interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436
7437static PyObject *
7438unicode_count(PyUnicodeObject *self, PyObject *args)
7439{
7440 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007441 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007442 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 PyObject *result;
7444
Jesus Ceaac451502011-04-20 17:09:23 +02007445 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7446 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007448
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007449 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007450 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007451 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007452 substring->str, substring->length,
7453 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007454 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007455
7456 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007457
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 return result;
7459}
7460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007461PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007462 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007464Encode S using the codec registered for encoding. Default encoding\n\
7465is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007466handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007467a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7468'xmlcharrefreplace' as well as any other name registered with\n\
7469codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470
7471static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007472unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007473{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007474 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 char *encoding = NULL;
7476 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007477
Benjamin Peterson308d6372009-09-18 21:42:35 +00007478 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7479 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007481 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007482}
7483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007484PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007485 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007486\n\
7487Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007488If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489
7490static PyObject*
7491unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7492{
7493 Py_UNICODE *e;
7494 Py_UNICODE *p;
7495 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007496 Py_UNICODE *qe;
7497 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007498 PyUnicodeObject *u;
7499 int tabsize = 8;
7500
7501 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007503
Thomas Wouters7e474022000-07-16 12:04:32 +00007504 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007505 i = 0; /* chars up to and including most recent \n or \r */
7506 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7507 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 for (p = self->str; p < e; p++)
7509 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 if (tabsize > 0) {
7511 incr = tabsize - (j % tabsize); /* cannot overflow */
7512 if (j > PY_SSIZE_T_MAX - incr)
7513 goto overflow1;
7514 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007515 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 if (j > PY_SSIZE_T_MAX - 1)
7519 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 j++;
7521 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007522 if (i > PY_SSIZE_T_MAX - j)
7523 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007525 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007526 }
7527 }
7528
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007529 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007531
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 /* Second pass: create output string and fill it */
7533 u = _PyUnicode_New(i + j);
7534 if (!u)
7535 return NULL;
7536
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007537 j = 0; /* same as in first pass */
7538 q = u->str; /* next output char */
7539 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540
7541 for (p = self->str; p < e; p++)
7542 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 if (tabsize > 0) {
7544 i = tabsize - (j % tabsize);
7545 j += i;
7546 while (i--) {
7547 if (q >= qe)
7548 goto overflow2;
7549 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007550 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007552 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007553 else {
7554 if (q >= qe)
7555 goto overflow2;
7556 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007557 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 if (*p == '\n' || *p == '\r')
7559 j = 0;
7560 }
7561
7562 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007563
7564 overflow2:
7565 Py_DECREF(u);
7566 overflow1:
7567 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569}
7570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007571PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573\n\
7574Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007575such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576arguments start and end are interpreted as in slice notation.\n\
7577\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007578Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007579
7580static PyObject *
7581unicode_find(PyUnicodeObject *self, PyObject *args)
7582{
Jesus Ceaac451502011-04-20 17:09:23 +02007583 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007584 Py_ssize_t start;
7585 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007586 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
Jesus Ceaac451502011-04-20 17:09:23 +02007588 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7589 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591
Thomas Wouters477c8d52006-05-27 19:21:47 +00007592 result = stringlib_find_slice(
7593 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7594 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7595 start, end
7596 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597
7598 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007599
Christian Heimes217cfd12007-12-02 14:31:20 +00007600 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601}
7602
7603static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007604unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605{
7606 if (index < 0 || index >= self->length) {
7607 PyErr_SetString(PyExc_IndexError, "string index out of range");
7608 return NULL;
7609 }
7610
7611 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7612}
7613
Guido van Rossumc2504932007-09-18 19:42:40 +00007614/* Believe it or not, this produces the same value for ASCII strings
7615 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007616static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007617unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618{
Guido van Rossumc2504932007-09-18 19:42:40 +00007619 Py_ssize_t len;
7620 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007621 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007622
7623 if (self->hash != -1)
7624 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007625 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007626 p = self->str;
7627 x = *p << 7;
7628 while (--len >= 0)
7629 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007630 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007631 if (x == -1)
7632 x = -2;
7633 self->hash = x;
7634 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007635}
7636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007637PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007640Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007641
7642static PyObject *
7643unicode_index(PyUnicodeObject *self, PyObject *args)
7644{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007645 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007646 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007647 Py_ssize_t start;
7648 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649
Jesus Ceaac451502011-04-20 17:09:23 +02007650 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7651 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653
Thomas Wouters477c8d52006-05-27 19:21:47 +00007654 result = stringlib_find_slice(
7655 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7656 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7657 start, end
7658 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659
7660 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007661
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662 if (result < 0) {
7663 PyErr_SetString(PyExc_ValueError, "substring not found");
7664 return NULL;
7665 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007666
Christian Heimes217cfd12007-12-02 14:31:20 +00007667 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668}
7669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007670PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007672\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007673Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007674at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
7676static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007677unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678{
7679 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7680 register const Py_UNICODE *e;
7681 int cased;
7682
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683 /* Shortcut for single character strings */
7684 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007687 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007688 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007689 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007690
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691 e = p + PyUnicode_GET_SIZE(self);
7692 cased = 0;
7693 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007695
Benjamin Peterson29060642009-01-31 22:14:21 +00007696 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7697 return PyBool_FromLong(0);
7698 else if (!cased && Py_UNICODE_ISLOWER(ch))
7699 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007701 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007702}
7703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007704PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007707Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007708at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709
7710static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007711unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712{
7713 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7714 register const Py_UNICODE *e;
7715 int cased;
7716
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 /* Shortcut for single character strings */
7718 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007721 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007722 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007724
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 e = p + PyUnicode_GET_SIZE(self);
7726 cased = 0;
7727 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007729
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7731 return PyBool_FromLong(0);
7732 else if (!cased && Py_UNICODE_ISUPPER(ch))
7733 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007735 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736}
7737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007738PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007741Return True if S is a titlecased string and there is at least one\n\
7742character in S, i.e. upper- and titlecase characters may only\n\
7743follow uncased characters and lowercase characters only cased ones.\n\
7744Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745
7746static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007747unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748{
7749 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7750 register const Py_UNICODE *e;
7751 int cased, previous_is_cased;
7752
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 /* Shortcut for single character strings */
7754 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7756 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007757
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007758 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007759 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007761
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762 e = p + PyUnicode_GET_SIZE(self);
7763 cased = 0;
7764 previous_is_cased = 0;
7765 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007767
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7769 if (previous_is_cased)
7770 return PyBool_FromLong(0);
7771 previous_is_cased = 1;
7772 cased = 1;
7773 }
7774 else if (Py_UNICODE_ISLOWER(ch)) {
7775 if (!previous_is_cased)
7776 return PyBool_FromLong(0);
7777 previous_is_cased = 1;
7778 cased = 1;
7779 }
7780 else
7781 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007783 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784}
7785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007786PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007789Return True if all characters in S are whitespace\n\
7790and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007791
7792static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007793unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794{
7795 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7796 register const Py_UNICODE *e;
7797
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 /* Shortcut for single character strings */
7799 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 Py_UNICODE_ISSPACE(*p))
7801 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007803 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007804 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007806
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 e = p + PyUnicode_GET_SIZE(self);
7808 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 if (!Py_UNICODE_ISSPACE(*p))
7810 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007812 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813}
7814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007815PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007817\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007818Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007819and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007820
7821static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007822unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007823{
7824 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7825 register const Py_UNICODE *e;
7826
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007827 /* Shortcut for single character strings */
7828 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007829 Py_UNICODE_ISALPHA(*p))
7830 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007831
7832 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007833 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007835
7836 e = p + PyUnicode_GET_SIZE(self);
7837 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 if (!Py_UNICODE_ISALPHA(*p))
7839 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007840 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007841 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007842}
7843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007844PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007846\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007847Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007848and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007849
7850static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007851unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007852{
7853 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7854 register const Py_UNICODE *e;
7855
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007856 /* Shortcut for single character strings */
7857 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007858 Py_UNICODE_ISALNUM(*p))
7859 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007860
7861 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007862 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007864
7865 e = p + PyUnicode_GET_SIZE(self);
7866 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 if (!Py_UNICODE_ISALNUM(*p))
7868 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007869 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007870 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007871}
7872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007873PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007876Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007877False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878
7879static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007880unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881{
7882 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7883 register const Py_UNICODE *e;
7884
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885 /* Shortcut for single character strings */
7886 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 Py_UNICODE_ISDECIMAL(*p))
7888 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007890 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007891 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007893
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894 e = p + PyUnicode_GET_SIZE(self);
7895 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 if (!Py_UNICODE_ISDECIMAL(*p))
7897 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007899 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900}
7901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007902PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007904\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007905Return True if all characters in S are digits\n\
7906and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907
7908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007909unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007910{
7911 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7912 register const Py_UNICODE *e;
7913
Guido van Rossumd57fd912000-03-10 22:53:23 +00007914 /* Shortcut for single character strings */
7915 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 Py_UNICODE_ISDIGIT(*p))
7917 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007918
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007919 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007920 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007922
Guido van Rossumd57fd912000-03-10 22:53:23 +00007923 e = p + PyUnicode_GET_SIZE(self);
7924 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 if (!Py_UNICODE_ISDIGIT(*p))
7926 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007928 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007929}
7930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007931PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007934Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007935False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007936
7937static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007938unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007939{
7940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7941 register const Py_UNICODE *e;
7942
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943 /* Shortcut for single character strings */
7944 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 Py_UNICODE_ISNUMERIC(*p))
7946 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007948 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007949 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007951
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 e = p + PyUnicode_GET_SIZE(self);
7953 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 if (!Py_UNICODE_ISNUMERIC(*p))
7955 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007957 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958}
7959
Martin v. Löwis47383402007-08-15 07:32:56 +00007960int
7961PyUnicode_IsIdentifier(PyObject *self)
7962{
7963 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7964 register const Py_UNICODE *e;
7965
7966 /* Special case for empty strings */
7967 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007969
7970 /* PEP 3131 says that the first character must be in
7971 XID_Start and subsequent characters in XID_Continue,
7972 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00007973 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00007974 letters, digits, underscore). However, given the current
7975 definition of XID_Start and XID_Continue, it is sufficient
7976 to check just for these, except that _ must be allowed
7977 as starting an identifier. */
7978 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7979 return 0;
7980
7981 e = p + PyUnicode_GET_SIZE(self);
7982 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 if (!_PyUnicode_IsXidContinue(*p))
7984 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00007985 }
7986 return 1;
7987}
7988
7989PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00007991\n\
7992Return True if S is a valid identifier according\n\
7993to the language definition.");
7994
7995static PyObject*
7996unicode_isidentifier(PyObject *self)
7997{
7998 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7999}
8000
Georg Brandl559e5d72008-06-11 18:37:52 +00008001PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008003\n\
8004Return True if all characters in S are considered\n\
8005printable in repr() or S is empty, False otherwise.");
8006
8007static PyObject*
8008unicode_isprintable(PyObject *self)
8009{
8010 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8011 register const Py_UNICODE *e;
8012
8013 /* Shortcut for single character strings */
8014 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8015 Py_RETURN_TRUE;
8016 }
8017
8018 e = p + PyUnicode_GET_SIZE(self);
8019 for (; p < e; p++) {
8020 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8021 Py_RETURN_FALSE;
8022 }
8023 }
8024 Py_RETURN_TRUE;
8025}
8026
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008027PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008028 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029\n\
8030Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008031iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032
8033static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008034unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008036 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037}
8038
Martin v. Löwis18e16552006-02-15 17:27:45 +00008039static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040unicode_length(PyUnicodeObject *self)
8041{
8042 return self->length;
8043}
8044
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008045PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008048Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008049done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050
8051static PyObject *
8052unicode_ljust(PyUnicodeObject *self, PyObject *args)
8053{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008054 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008055 Py_UNICODE fillchar = ' ';
8056
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008057 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058 return NULL;
8059
Tim Peters7a29bd52001-09-12 03:03:31 +00008060 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 Py_INCREF(self);
8062 return (PyObject*) self;
8063 }
8064
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008065 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066}
8067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008068PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008071Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072
8073static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008074unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 return fixup(self, fixlower);
8077}
8078
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip kinds above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip kind: the format string minus its leading
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
8087
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008088/* externally visible for str.strip(unicode) */
8089PyObject *
8090_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8091{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008092 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8093 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8094 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8095 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8096 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008097
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008099
Benjamin Peterson14339b62009-01-31 16:36:08 +00008100 i = 0;
8101 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8103 i++;
8104 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008105 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008106
Benjamin Peterson14339b62009-01-31 16:36:08 +00008107 j = len;
8108 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 do {
8110 j--;
8111 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8112 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008113 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008114
Benjamin Peterson14339b62009-01-31 16:36:08 +00008115 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 Py_INCREF(self);
8117 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008118 }
8119 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008121}
8122
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123
8124static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008125do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008127 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8128 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008129
Benjamin Peterson14339b62009-01-31 16:36:08 +00008130 i = 0;
8131 if (striptype != RIGHTSTRIP) {
8132 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8133 i++;
8134 }
8135 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008136
Benjamin Peterson14339b62009-01-31 16:36:08 +00008137 j = len;
8138 if (striptype != LEFTSTRIP) {
8139 do {
8140 j--;
8141 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8142 j++;
8143 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008144
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8146 Py_INCREF(self);
8147 return (PyObject*)self;
8148 }
8149 else
8150 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151}
8152
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008153
8154static PyObject *
8155do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8156{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008157 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008158
Benjamin Peterson14339b62009-01-31 16:36:08 +00008159 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8160 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008161
Benjamin Peterson14339b62009-01-31 16:36:08 +00008162 if (sep != NULL && sep != Py_None) {
8163 if (PyUnicode_Check(sep))
8164 return _PyUnicode_XStrip(self, striptype, sep);
8165 else {
8166 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 "%s arg must be None or str",
8168 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 return NULL;
8170 }
8171 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008172
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008174}
8175
8176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008177PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008179\n\
8180Return a copy of the string S with leading and trailing\n\
8181whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008182If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008183
8184static PyObject *
8185unicode_strip(PyUnicodeObject *self, PyObject *args)
8186{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 if (PyTuple_GET_SIZE(args) == 0)
8188 return do_strip(self, BOTHSTRIP); /* Common case */
8189 else
8190 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008191}
8192
8193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008194PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008196\n\
8197Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008198If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008199
8200static PyObject *
8201unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8202{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 if (PyTuple_GET_SIZE(args) == 0)
8204 return do_strip(self, LEFTSTRIP); /* Common case */
8205 else
8206 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008207}
8208
8209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008210PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008212\n\
8213Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008214If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008215
8216static PyObject *
8217unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8218{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008219 if (PyTuple_GET_SIZE(args) == 0)
8220 return do_strip(self, RIGHTSTRIP); /* Common case */
8221 else
8222 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008223}
8224
8225
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008227unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228{
8229 PyUnicodeObject *u;
8230 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008231 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008232 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233
Georg Brandl222de0f2009-04-12 12:01:50 +00008234 if (len < 1) {
8235 Py_INCREF(unicode_empty);
8236 return (PyObject *)unicode_empty;
8237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238
Tim Peters7a29bd52001-09-12 03:03:31 +00008239 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008240 /* no repeat, return original string */
8241 Py_INCREF(str);
8242 return (PyObject*) str;
8243 }
Tim Peters8f422462000-09-09 06:13:41 +00008244
8245 /* ensure # of chars needed doesn't overflow int and # of bytes
8246 * needed doesn't overflow size_t
8247 */
8248 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008249 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008250 PyErr_SetString(PyExc_OverflowError,
8251 "repeated string is too long");
8252 return NULL;
8253 }
8254 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8255 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8256 PyErr_SetString(PyExc_OverflowError,
8257 "repeated string is too long");
8258 return NULL;
8259 }
8260 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261 if (!u)
8262 return NULL;
8263
8264 p = u->str;
8265
Georg Brandl222de0f2009-04-12 12:01:50 +00008266 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008267 Py_UNICODE_FILL(p, str->str[0], len);
8268 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008269 Py_ssize_t done = str->length; /* number of characters copied this far */
8270 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008272 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008273 Py_UNICODE_COPY(p+done, p, n);
8274 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 }
8277
8278 return (PyObject*) u;
8279}
8280
8281PyObject *PyUnicode_Replace(PyObject *obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 PyObject *subobj,
8283 PyObject *replobj,
8284 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285{
8286 PyObject *self;
8287 PyObject *str1;
8288 PyObject *str2;
8289 PyObject *result;
8290
8291 self = PyUnicode_FromObject(obj);
8292 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 str1 = PyUnicode_FromObject(subobj);
8295 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 Py_DECREF(self);
8297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298 }
8299 str2 = PyUnicode_FromObject(replobj);
8300 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 Py_DECREF(self);
8302 Py_DECREF(str1);
8303 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 }
Tim Petersced69f82003-09-16 20:30:58 +00008305 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 (PyUnicodeObject *)str1,
8307 (PyUnicodeObject *)str2,
8308 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 Py_DECREF(self);
8310 Py_DECREF(str1);
8311 Py_DECREF(str2);
8312 return result;
8313}
8314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008315PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008316 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317\n\
8318Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008319old replaced by new. If the optional argument count is\n\
8320given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321
8322static PyObject*
8323unicode_replace(PyUnicodeObject *self, PyObject *args)
8324{
8325 PyUnicodeObject *str1;
8326 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008327 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 PyObject *result;
8329
Martin v. Löwis18e16552006-02-15 17:27:45 +00008330 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008331 return NULL;
8332 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8333 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008336 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 Py_DECREF(str1);
8338 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340
8341 result = replace(self, str1, str2, maxcount);
8342
8343 Py_DECREF(str1);
8344 Py_DECREF(str2);
8345 return result;
8346}
8347
8348static
8349PyObject *unicode_repr(PyObject *unicode)
8350{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008351 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008352 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008353 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8354 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8355
8356 /* XXX(nnorwitz): rather than over-allocating, it would be
8357 better to choose a different scheme. Perhaps scan the
8358 first N-chars of the string and allocate based on that size.
8359 */
8360 /* Initial allocation is based on the longest-possible unichr
8361 escape.
8362
8363 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8364 unichr, so in this case it's the longest unichr escape. In
8365 narrow (UTF-16) builds this is five chars per source unichr
8366 since there are two unichrs in the surrogate pair, so in narrow
8367 (UTF-16) builds it's not the longest unichr escape.
8368
8369 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8370 so in the narrow (UTF-16) build case it's the longest unichr
8371 escape.
8372 */
8373
Walter Dörwald1ab83302007-05-18 17:15:44 +00008374 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008376#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008378#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008380#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008382 if (repr == NULL)
8383 return NULL;
8384
Walter Dörwald1ab83302007-05-18 17:15:44 +00008385 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008386
8387 /* Add quote */
8388 *p++ = (findchar(s, size, '\'') &&
8389 !findchar(s, size, '"')) ? '"' : '\'';
8390 while (size-- > 0) {
8391 Py_UNICODE ch = *s++;
8392
8393 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008394 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008395 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008396 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008397 continue;
8398 }
8399
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008401 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008402 *p++ = '\\';
8403 *p++ = 't';
8404 }
8405 else if (ch == '\n') {
8406 *p++ = '\\';
8407 *p++ = 'n';
8408 }
8409 else if (ch == '\r') {
8410 *p++ = '\\';
8411 *p++ = 'r';
8412 }
8413
8414 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008415 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008416 *p++ = '\\';
8417 *p++ = 'x';
8418 *p++ = hexdigits[(ch >> 4) & 0x000F];
8419 *p++ = hexdigits[ch & 0x000F];
8420 }
8421
Georg Brandl559e5d72008-06-11 18:37:52 +00008422 /* Copy ASCII characters as-is */
8423 else if (ch < 0x7F) {
8424 *p++ = ch;
8425 }
8426
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008428 else {
8429 Py_UCS4 ucs = ch;
8430
8431#ifndef Py_UNICODE_WIDE
8432 Py_UNICODE ch2 = 0;
8433 /* Get code point from surrogate pair */
8434 if (size > 0) {
8435 ch2 = *s;
8436 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008440 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008441 size--;
8442 }
8443 }
8444#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008445 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008446 (categories Z* and C* except ASCII space)
8447 */
8448 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8449 /* Map 8-bit characters to '\xhh' */
8450 if (ucs <= 0xff) {
8451 *p++ = '\\';
8452 *p++ = 'x';
8453 *p++ = hexdigits[(ch >> 4) & 0x000F];
8454 *p++ = hexdigits[ch & 0x000F];
8455 }
8456 /* Map 21-bit characters to '\U00xxxxxx' */
8457 else if (ucs >= 0x10000) {
8458 *p++ = '\\';
8459 *p++ = 'U';
8460 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8461 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8462 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8463 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8464 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8465 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8466 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8467 *p++ = hexdigits[ucs & 0x0000000F];
8468 }
8469 /* Map 16-bit characters to '\uxxxx' */
8470 else {
8471 *p++ = '\\';
8472 *p++ = 'u';
8473 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8474 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8475 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8476 *p++ = hexdigits[ucs & 0x000F];
8477 }
8478 }
8479 /* Copy characters as-is */
8480 else {
8481 *p++ = ch;
8482#ifndef Py_UNICODE_WIDE
8483 if (ucs >= 0x10000)
8484 *p++ = ch2;
8485#endif
8486 }
8487 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008488 }
8489 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008490 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008491
8492 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008493 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008494 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008495}
8496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008497PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008499\n\
8500Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008501such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008502arguments start and end are interpreted as in slice notation.\n\
8503\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008504Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008505
8506static PyObject *
8507unicode_rfind(PyUnicodeObject *self, PyObject *args)
8508{
Jesus Ceaac451502011-04-20 17:09:23 +02008509 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008510 Py_ssize_t start;
8511 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008512 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008513
Jesus Ceaac451502011-04-20 17:09:23 +02008514 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8515 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008517
Thomas Wouters477c8d52006-05-27 19:21:47 +00008518 result = stringlib_rfind_slice(
8519 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8520 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8521 start, end
8522 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523
8524 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008525
Christian Heimes217cfd12007-12-02 14:31:20 +00008526 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008527}
8528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008529PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008531\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008532Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008533
8534static PyObject *
8535unicode_rindex(PyUnicodeObject *self, PyObject *args)
8536{
Jesus Ceaac451502011-04-20 17:09:23 +02008537 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008538 Py_ssize_t start;
8539 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008540 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008541
Jesus Ceaac451502011-04-20 17:09:23 +02008542 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8543 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008544 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
Thomas Wouters477c8d52006-05-27 19:21:47 +00008546 result = stringlib_rfind_slice(
8547 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8548 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8549 start, end
8550 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008551
8552 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008553
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554 if (result < 0) {
8555 PyErr_SetString(PyExc_ValueError, "substring not found");
8556 return NULL;
8557 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008558 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559}
8560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008561PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008564Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008565done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008566
8567static PyObject *
8568unicode_rjust(PyUnicodeObject *self, PyObject *args)
8569{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008570 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008571 Py_UNICODE fillchar = ' ';
8572
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008573 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574 return NULL;
8575
Tim Peters7a29bd52001-09-12 03:03:31 +00008576 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577 Py_INCREF(self);
8578 return (PyObject*) self;
8579 }
8580
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008581 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582}
8583
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584PyObject *PyUnicode_Split(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 PyObject *sep,
8586 Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587{
8588 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008589
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590 s = PyUnicode_FromObject(s);
8591 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008592 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 if (sep != NULL) {
8594 sep = PyUnicode_FromObject(sep);
8595 if (sep == NULL) {
8596 Py_DECREF(s);
8597 return NULL;
8598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599 }
8600
8601 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8602
8603 Py_DECREF(s);
8604 Py_XDECREF(sep);
8605 return result;
8606}
8607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008608PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610\n\
8611Return a list of the words in S, using sep as the\n\
8612delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008613splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008614whitespace string is a separator and empty strings are\n\
8615removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616
8617static PyObject*
8618unicode_split(PyUnicodeObject *self, PyObject *args)
8619{
8620 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008621 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622
Martin v. Löwis18e16552006-02-15 17:27:45 +00008623 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 return NULL;
8625
8626 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632}
8633
Thomas Wouters477c8d52006-05-27 19:21:47 +00008634PyObject *
8635PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8636{
8637 PyObject* str_obj;
8638 PyObject* sep_obj;
8639 PyObject* out;
8640
8641 str_obj = PyUnicode_FromObject(str_in);
8642 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008644 sep_obj = PyUnicode_FromObject(sep_in);
8645 if (!sep_obj) {
8646 Py_DECREF(str_obj);
8647 return NULL;
8648 }
8649
8650 out = stringlib_partition(
8651 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8652 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8653 );
8654
8655 Py_DECREF(sep_obj);
8656 Py_DECREF(str_obj);
8657
8658 return out;
8659}
8660
8661
8662PyObject *
8663PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8664{
8665 PyObject* str_obj;
8666 PyObject* sep_obj;
8667 PyObject* out;
8668
8669 str_obj = PyUnicode_FromObject(str_in);
8670 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008672 sep_obj = PyUnicode_FromObject(sep_in);
8673 if (!sep_obj) {
8674 Py_DECREF(str_obj);
8675 return NULL;
8676 }
8677
8678 out = stringlib_rpartition(
8679 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8680 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8681 );
8682
8683 Py_DECREF(sep_obj);
8684 Py_DECREF(str_obj);
8685
8686 return out;
8687}
8688
8689PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008691\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008692Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008693the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008694found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008695
8696static PyObject*
8697unicode_partition(PyUnicodeObject *self, PyObject *separator)
8698{
8699 return PyUnicode_Partition((PyObject *)self, separator);
8700}
8701
8702PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008703 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008704\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008705Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008706the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008707separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008708
8709static PyObject*
8710unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8711{
8712 return PyUnicode_RPartition((PyObject *)self, separator);
8713}
8714
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008715PyObject *PyUnicode_RSplit(PyObject *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 PyObject *sep,
8717 Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008718{
8719 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008720
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008721 s = PyUnicode_FromObject(s);
8722 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008723 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 if (sep != NULL) {
8725 sep = PyUnicode_FromObject(sep);
8726 if (sep == NULL) {
8727 Py_DECREF(s);
8728 return NULL;
8729 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008730 }
8731
8732 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8733
8734 Py_DECREF(s);
8735 Py_XDECREF(sep);
8736 return result;
8737}
8738
8739PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008741\n\
8742Return a list of the words in S, using sep as the\n\
8743delimiter string, starting at the end of the string and\n\
8744working to the front. If maxsplit is given, at most maxsplit\n\
8745splits are done. If sep is not specified, any whitespace string\n\
8746is a separator.");
8747
8748static PyObject*
8749unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8750{
8751 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008752 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008753
Martin v. Löwis18e16552006-02-15 17:27:45 +00008754 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008755 return NULL;
8756
8757 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008759 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008761 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008763}
8764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008765PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767\n\
8768Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008769Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008770is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771
8772static PyObject*
8773unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8774{
Guido van Rossum86662912000-04-11 15:38:46 +00008775 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776
Guido van Rossum86662912000-04-11 15:38:46 +00008777 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008778 return NULL;
8779
Guido van Rossum86662912000-04-11 15:38:46 +00008780 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008781}
8782
8783static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008784PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785{
Walter Dörwald346737f2007-05-31 10:44:43 +00008786 if (PyUnicode_CheckExact(self)) {
8787 Py_INCREF(self);
8788 return self;
8789 } else
8790 /* Subtype -- return genuine unicode string with the same value. */
8791 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8792 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793}
8794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008795PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797\n\
8798Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008799and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800
8801static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008802unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008804 return fixup(self, fixswapcase);
8805}
8806
Georg Brandlceee0772007-11-27 23:48:05 +00008807PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008809\n\
8810Return a translation table usable for str.translate().\n\
8811If there is only one argument, it must be a dictionary mapping Unicode\n\
8812ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008813Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008814If there are two arguments, they must be strings of equal length, and\n\
8815in the resulting dictionary, each character in x will be mapped to the\n\
8816character at the same position in y. If there is a third argument, it\n\
8817must be a string, whose characters will be mapped to None in the result.");
8818
8819static PyObject*
8820unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8821{
8822 PyObject *x, *y = NULL, *z = NULL;
8823 PyObject *new = NULL, *key, *value;
8824 Py_ssize_t i = 0;
8825 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008826
Georg Brandlceee0772007-11-27 23:48:05 +00008827 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8828 return NULL;
8829 new = PyDict_New();
8830 if (!new)
8831 return NULL;
8832 if (y != NULL) {
8833 /* x must be a string too, of equal length */
8834 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8835 if (!PyUnicode_Check(x)) {
8836 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8837 "be a string if there is a second argument");
8838 goto err;
8839 }
8840 if (PyUnicode_GET_SIZE(x) != ylen) {
8841 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8842 "arguments must have equal length");
8843 goto err;
8844 }
8845 /* create entries for translating chars in x to those in y */
8846 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008847 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8848 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008849 if (!key || !value)
8850 goto err;
8851 res = PyDict_SetItem(new, key, value);
8852 Py_DECREF(key);
8853 Py_DECREF(value);
8854 if (res < 0)
8855 goto err;
8856 }
8857 /* create entries for deleting chars in z */
8858 if (z != NULL) {
8859 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008860 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008861 if (!key)
8862 goto err;
8863 res = PyDict_SetItem(new, key, Py_None);
8864 Py_DECREF(key);
8865 if (res < 0)
8866 goto err;
8867 }
8868 }
8869 } else {
8870 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008871 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008872 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8873 "to maketrans it must be a dict");
8874 goto err;
8875 }
8876 /* copy entries into the new dict, converting string keys to int keys */
8877 while (PyDict_Next(x, &i, &key, &value)) {
8878 if (PyUnicode_Check(key)) {
8879 /* convert string keys to integer keys */
8880 PyObject *newkey;
8881 if (PyUnicode_GET_SIZE(key) != 1) {
8882 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8883 "table must be of length 1");
8884 goto err;
8885 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008886 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008887 if (!newkey)
8888 goto err;
8889 res = PyDict_SetItem(new, newkey, value);
8890 Py_DECREF(newkey);
8891 if (res < 0)
8892 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008893 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008894 /* just keep integer keys */
8895 if (PyDict_SetItem(new, key, value) < 0)
8896 goto err;
8897 } else {
8898 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8899 "be strings or integers");
8900 goto err;
8901 }
8902 }
8903 }
8904 return new;
8905 err:
8906 Py_DECREF(new);
8907 return NULL;
8908}
8909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008910PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912\n\
8913Return a copy of the string S, where all characters have been mapped\n\
8914through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008915Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008916Unmapped characters are left untouched. Characters mapped to None\n\
8917are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008918
8919static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008920unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008921{
Georg Brandlceee0772007-11-27 23:48:05 +00008922 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923}
8924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008925PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008928Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929
8930static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008931unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 return fixup(self, fixupper);
8934}
8935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008936PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00008939Pad a numeric string S with zeros on the left, to fill a field\n\
8940of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941
8942static PyObject *
8943unicode_zfill(PyUnicodeObject *self, PyObject *args)
8944{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008945 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008946 PyUnicodeObject *u;
8947
Martin v. Löwis18e16552006-02-15 17:27:45 +00008948 Py_ssize_t width;
8949 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008950 return NULL;
8951
8952 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00008953 if (PyUnicode_CheckExact(self)) {
8954 Py_INCREF(self);
8955 return (PyObject*) self;
8956 }
8957 else
8958 return PyUnicode_FromUnicode(
8959 PyUnicode_AS_UNICODE(self),
8960 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 }
8963
8964 fill = width - self->length;
8965
8966 u = pad(self, fill, 0, '0');
8967
Walter Dörwald068325e2002-04-15 13:36:47 +00008968 if (u == NULL)
8969 return NULL;
8970
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 if (u->str[fill] == '+' || u->str[fill] == '-') {
8972 /* move sign to beginning of string */
8973 u->str[0] = u->str[fill];
8974 u->str[fill] = '0';
8975 }
8976
8977 return (PyObject*) u;
8978}
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979
#if 0
/* Compiled out.  Returns the current value of the file-level numfree
   counter as an int object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}

/* Compiled out.  Passes self's character buffer and length to
   PyUnicode_TransformDecimalToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
8994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008995PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00008998Return True if S starts with the specified prefix, False otherwise.\n\
8999With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009000With optional end, stop comparing S at that position.\n\
9001prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002
9003static PyObject *
9004unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009007 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009009 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009010 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009011 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012
Jesus Ceaac451502011-04-20 17:09:23 +02009013 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009015 if (PyTuple_Check(subobj)) {
9016 Py_ssize_t i;
9017 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9018 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009019 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009020 if (substring == NULL)
9021 return NULL;
9022 result = tailmatch(self, substring, start, end, -1);
9023 Py_DECREF(substring);
9024 if (result) {
9025 Py_RETURN_TRUE;
9026 }
9027 }
9028 /* nothing matched */
9029 Py_RETURN_FALSE;
9030 }
9031 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009032 if (substring == NULL) {
9033 if (PyErr_ExceptionMatches(PyExc_TypeError))
9034 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9035 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009037 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009038 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009040 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041}
9042
9043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009044PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009045 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009047Return True if S ends with the specified suffix, False otherwise.\n\
9048With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009049With optional end, stop comparing S at that position.\n\
9050suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051
9052static PyObject *
9053unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009054 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009056 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009058 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009059 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009060 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061
Jesus Ceaac451502011-04-20 17:09:23 +02009062 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009063 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009064 if (PyTuple_Check(subobj)) {
9065 Py_ssize_t i;
9066 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9067 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009069 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009070 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009071 result = tailmatch(self, substring, start, end, +1);
9072 Py_DECREF(substring);
9073 if (result) {
9074 Py_RETURN_TRUE;
9075 }
9076 }
9077 Py_RETURN_FALSE;
9078 }
9079 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009080 if (substring == NULL) {
9081 if (PyErr_ExceptionMatches(PyExc_TypeError))
9082 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9083 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009084 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009085 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009086 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009088 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089}
9090
Eric Smith8c663262007-08-25 02:26:07 +00009091#include "stringlib/string_format.h"
9092
9093PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009095\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009096Return a formatted version of S, using substitutions from args and kwargs.\n\
9097The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009098
Eric Smith27bbca62010-11-04 17:06:58 +00009099PyDoc_STRVAR(format_map__doc__,
9100 "S.format_map(mapping) -> str\n\
9101\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009102Return a formatted version of S, using substitutions from mapping.\n\
9103The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009104
Eric Smith4a7d76d2008-05-30 18:10:19 +00009105static PyObject *
9106unicode__format__(PyObject* self, PyObject* args)
9107{
9108 PyObject *format_spec;
9109
9110 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9111 return NULL;
9112
9113 return _PyUnicode_FormatAdvanced(self,
9114 PyUnicode_AS_UNICODE(format_spec),
9115 PyUnicode_GET_SIZE(format_spec));
9116}
9117
Eric Smith8c663262007-08-25 02:26:07 +00009118PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009120\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009121Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009122
9123static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009124unicode__sizeof__(PyUnicodeObject *v)
9125{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009126 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9127 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009128}
9129
9130PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009132
9133static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009134unicode_getnewargs(PyUnicodeObject *v)
9135{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009136 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009137}
9138
Guido van Rossumd57fd912000-03-10 22:53:23 +00009139static PyMethodDef unicode_methods[] = {
9140
9141 /* Order is according to common usage: often used methods should
9142 appear first, since lookup is done sequentially. */
9143
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009144 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009145 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9146 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009147 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009148 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9149 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9150 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9151 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9152 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9153 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9154 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009155 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009156 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9157 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9158 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009159 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009160 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9161 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9162 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009163 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009164 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009165 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009166 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009167 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9168 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9169 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9170 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9171 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9172 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9173 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9174 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9175 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9176 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9177 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9178 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9179 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9180 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009181 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009182 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009183 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009184 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009185 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009186 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009187 {"maketrans", (PyCFunction) unicode_maketrans,
9188 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009189 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009190#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009191 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192#endif
9193
9194#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009195 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009196 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009197 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198#endif
9199
Benjamin Peterson14339b62009-01-31 16:36:08 +00009200 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201 {NULL, NULL}
9202};
9203
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009204static PyObject *
9205unicode_mod(PyObject *v, PyObject *w)
9206{
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 if (!PyUnicode_Check(v)) {
9208 Py_INCREF(Py_NotImplemented);
9209 return Py_NotImplemented;
9210 }
9211 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009212}
9213
9214static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009215 0, /*nb_add*/
9216 0, /*nb_subtract*/
9217 0, /*nb_multiply*/
9218 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009219};
9220
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009222 (lenfunc) unicode_length, /* sq_length */
9223 PyUnicode_Concat, /* sq_concat */
9224 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9225 (ssizeargfunc) unicode_getitem, /* sq_item */
9226 0, /* sq_slice */
9227 0, /* sq_ass_item */
9228 0, /* sq_ass_slice */
9229 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230};
9231
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009232static PyObject*
9233unicode_subscript(PyUnicodeObject* self, PyObject* item)
9234{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009235 if (PyIndex_Check(item)) {
9236 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009237 if (i == -1 && PyErr_Occurred())
9238 return NULL;
9239 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009240 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009241 return unicode_getitem(self, i);
9242 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009243 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009244 Py_UNICODE* source_buf;
9245 Py_UNICODE* result_buf;
9246 PyObject* result;
9247
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009248 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009249 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009250 return NULL;
9251 }
9252
9253 if (slicelength <= 0) {
9254 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009255 } else if (start == 0 && step == 1 && slicelength == self->length &&
9256 PyUnicode_CheckExact(self)) {
9257 Py_INCREF(self);
9258 return (PyObject *)self;
9259 } else if (step == 1) {
9260 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009261 } else {
9262 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009263 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9264 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009265
Benjamin Peterson29060642009-01-31 22:14:21 +00009266 if (result_buf == NULL)
9267 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009268
9269 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9270 result_buf[i] = source_buf[cur];
9271 }
Tim Petersced69f82003-09-16 20:30:58 +00009272
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009273 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009274 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009275 return result;
9276 }
9277 } else {
9278 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9279 return NULL;
9280 }
9281}
9282
9283static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009284 (lenfunc)unicode_length, /* mp_length */
9285 (binaryfunc)unicode_subscript, /* mp_subscript */
9286 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009287};
9288
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290/* Helpers for PyUnicode_Format() */
9291
9292static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009293getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009295 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009296 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009297 (*p_argidx)++;
9298 if (arglen < 0)
9299 return args;
9300 else
9301 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 }
9303 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 return NULL;
9306}
9307
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009308/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009310static PyObject *
9311formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009313 char *p;
9314 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009316
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317 x = PyFloat_AsDouble(v);
9318 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009319 return NULL;
9320
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009322 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009323
Eric Smith0923d1d2009-04-16 20:16:10 +00009324 p = PyOS_double_to_string(x, type, prec,
9325 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009326 if (p == NULL)
9327 return NULL;
9328 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009329 PyMem_Free(p);
9330 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009331}
9332
Tim Peters38fd5b62000-09-21 05:43:11 +00009333static PyObject*
9334formatlong(PyObject *val, int flags, int prec, int type)
9335{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009336 char *buf;
9337 int len;
9338 PyObject *str; /* temporary string object. */
9339 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009340
Benjamin Peterson14339b62009-01-31 16:36:08 +00009341 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9342 if (!str)
9343 return NULL;
9344 result = PyUnicode_FromStringAndSize(buf, len);
9345 Py_DECREF(str);
9346 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009347}
9348
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349static int
9350formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009351 size_t buflen,
9352 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009354 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009355 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009356 if (PyUnicode_GET_SIZE(v) == 1) {
9357 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9358 buf[1] = '\0';
9359 return 1;
9360 }
9361#ifndef Py_UNICODE_WIDE
9362 if (PyUnicode_GET_SIZE(v) == 2) {
9363 /* Decode a valid surrogate pair */
9364 int c0 = PyUnicode_AS_UNICODE(v)[0];
9365 int c1 = PyUnicode_AS_UNICODE(v)[1];
9366 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9367 0xDC00 <= c1 && c1 <= 0xDFFF) {
9368 buf[0] = c0;
9369 buf[1] = c1;
9370 buf[2] = '\0';
9371 return 2;
9372 }
9373 }
9374#endif
9375 goto onError;
9376 }
9377 else {
9378 /* Integer input truncated to a character */
9379 long x;
9380 x = PyLong_AsLong(v);
9381 if (x == -1 && PyErr_Occurred())
9382 goto onError;
9383
9384 if (x < 0 || x > 0x10ffff) {
9385 PyErr_SetString(PyExc_OverflowError,
9386 "%c arg not in range(0x110000)");
9387 return -1;
9388 }
9389
9390#ifndef Py_UNICODE_WIDE
9391 if (x > 0xffff) {
9392 x -= 0x10000;
9393 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9394 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9395 return 2;
9396 }
9397#endif
9398 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009399 buf[1] = '\0';
9400 return 1;
9401 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009402
Benjamin Peterson29060642009-01-31 22:14:21 +00009403 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009404 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009405 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009406 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009407}
9408
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009409/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009410 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009411*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009412#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009413
/* PyUnicode_Format(format, args): build the result of `format % args`.

   format is converted with PyUnicode_FromObject().  args is either a tuple
   of arguments, a single non-tuple argument, or -- when its type defines
   tp_as_mapping and it is neither a tuple nor a str -- a mapping used for
   %(key) lookups.

   Returns a new reference to the formatted unicode object, or NULL with an
   exception set.  Both exits release uformat and any args reference
   obtained from a mapping lookup; the error exit also releases result.

   Bookkeeping used throughout the main loop:
     fmt / fmtcnt  - read cursor into the format, characters still unread
     res / rescnt  - write cursor into result, free slots still available
     reslen        - currently allocated length of result
     arglen/argidx - tuple length and next index; -1/-2 are used for a
                     non-tuple argument (interpreted by getnextarg(), which
                     is defined outside this block)
*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson29060642009-01-31 22:14:21 +00009415 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009416{
9417 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009418 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419 int args_owned = 0;
9420 PyUnicodeObject *result = NULL;
9421 PyObject *dict = NULL;
9422 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009423
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009425 PyErr_BadInternalCall();
9426 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427 }
9428 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009429 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009430 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431 fmt = PyUnicode_AS_UNICODE(uformat);
9432 fmtcnt = PyUnicode_GET_SIZE(uformat);
9433
/* Start with room for the format text plus 100 spare characters; the
   buffer is grown on demand below and trimmed to size at the end. */
9434 reslen = rescnt = fmtcnt + 100;
9435 result = _PyUnicode_New(reslen);
9436 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009437 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438 res = PyUnicode_AS_UNICODE(result);
9439
9440 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009441 arglen = PyTuple_Size(args);
9442 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009443 }
9444 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009445 arglen = -1;
9446 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447 }
/* Only a non-tuple, non-str object whose type has tp_as_mapping is used
   for %(key) lookups. */
Christian Heimes90aa7642007-12-19 02:45:37 +00009448 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009449 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009450 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451
9452 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009453 if (*fmt != '%') {
/* Literal character: copy it through, growing result by the remaining
   format length plus 100 when no free slot is left. */
9454 if (--rescnt < 0) {
9455 rescnt = fmtcnt + 100;
9456 reslen += rescnt;
9457 if (_PyUnicode_Resize(&result, reslen) < 0)
9458 goto onError;
9459 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9460 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009461 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009462 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009463 }
9464 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009465 /* Got a format specifier */
9466 int flags = 0;
9467 Py_ssize_t width = -1;
9468 int prec = -1;
9469 Py_UNICODE c = '\0';
9470 Py_UNICODE fill;
9471 int isnumok;
9472 PyObject *v = NULL;
9473 PyObject *temp = NULL;
9474 Py_UNICODE *pbuf;
9475 Py_UNICODE sign;
9476 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009477 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478
Benjamin Peterson29060642009-01-31 22:14:21 +00009479 fmt++;
/* Optional "(key)": look the key up in the mapping; the looked-up value
   replaces args for this conversion and is owned by this function. */
9480 if (*fmt == '(') {
9481 Py_UNICODE *keystart;
9482 Py_ssize_t keylen;
9483 PyObject *key;
9484 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009485
Benjamin Peterson29060642009-01-31 22:14:21 +00009486 if (dict == NULL) {
9487 PyErr_SetString(PyExc_TypeError,
9488 "format requires a mapping");
9489 goto onError;
9490 }
9491 ++fmt;
9492 --fmtcnt;
9493 keystart = fmt;
9494 /* Skip over balanced parentheses */
9495 while (pcount > 0 && --fmtcnt >= 0) {
9496 if (*fmt == ')')
9497 --pcount;
9498 else if (*fmt == '(')
9499 ++pcount;
9500 fmt++;
9501 }
/* fmt now points one past the closing ')', hence the -1. */
9502 keylen = fmt - keystart - 1;
9503 if (fmtcnt < 0 || pcount > 0) {
9504 PyErr_SetString(PyExc_ValueError,
9505 "incomplete format key");
9506 goto onError;
9507 }
9508#if 0
9509 /* keys are converted to strings using UTF-8 and
9510 then looked up since Python uses strings to hold
9511 variables names etc. in its namespaces and we
9512 wouldn't want to break common idioms. */
9513 key = PyUnicode_EncodeUTF8(keystart,
9514 keylen,
9515 NULL);
9516#else
9517 key = PyUnicode_FromUnicode(keystart, keylen);
9518#endif
9519 if (key == NULL)
9520 goto onError;
/* Drop the value fetched for a previous %(key) before replacing it. */
9521 if (args_owned) {
9522 Py_DECREF(args);
9523 args_owned = 0;
9524 }
9525 args = PyObject_GetItem(dict, key);
9526 Py_DECREF(key);
9527 if (args == NULL) {
9528 goto onError;
9529 }
9530 args_owned = 1;
9531 arglen = -1;
9532 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009533 }
/* Conversion flags: any number of '-', '+', ' ', '#', '0'.  On exit c
   holds the first character that is not a flag. */
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 while (--fmtcnt >= 0) {
9535 switch (c = *fmt++) {
9536 case '-': flags |= F_LJUST; continue;
9537 case '+': flags |= F_SIGN; continue;
9538 case ' ': flags |= F_BLANK; continue;
9539 case '#': flags |= F_ALT; continue;
9540 case '0': flags |= F_ZERO; continue;
9541 }
9542 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009543 }
/* Field width: either '*' (taken from the next argument; a negative
   value selects left justification) or a run of decimal digits. */
Benjamin Peterson29060642009-01-31 22:14:21 +00009544 if (c == '*') {
9545 v = getnextarg(args, arglen, &argidx);
9546 if (v == NULL)
9547 goto onError;
9548 if (!PyLong_Check(v)) {
9549 PyErr_SetString(PyExc_TypeError,
9550 "* wants int");
9551 goto onError;
9552 }
9553 width = PyLong_AsLong(v);
9554 if (width == -1 && PyErr_Occurred())
9555 goto onError;
/* NOTE(review): negating the most negative value overflows -- confirm
   whether that input needs an explicit check. */
9556 if (width < 0) {
9557 flags |= F_LJUST;
9558 width = -width;
9559 }
9560 if (--fmtcnt >= 0)
9561 c = *fmt++;
9562 }
9563 else if (c >= '0' && c <= '9') {
9564 width = c - '0';
9565 while (--fmtcnt >= 0) {
9566 c = *fmt++;
9567 if (c < '0' || c > '9')
9568 break;
/* NOTE(review): this overflow test multiplies first and so relies on
   signed wraparound; it also does not cover the "+ digit" below. */
9569 if ((width*10) / 10 != width) {
9570 PyErr_SetString(PyExc_ValueError,
9571 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009572 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 }
9574 width = width*10 + (c - '0');
9575 }
9576 }
/* Precision: '.' followed by '*' (next argument, negative clamped to 0),
   by decimal digits, or by nothing (precision 0). */
9577 if (c == '.') {
9578 prec = 0;
9579 if (--fmtcnt >= 0)
9580 c = *fmt++;
9581 if (c == '*') {
9582 v = getnextarg(args, arglen, &argidx);
9583 if (v == NULL)
9584 goto onError;
9585 if (!PyLong_Check(v)) {
9586 PyErr_SetString(PyExc_TypeError,
9587 "* wants int");
9588 goto onError;
9589 }
/* NOTE(review): prec is an int but PyLong_AsLong() returns a long, so a
   large value may be narrowed silently where long is wider than int. */
9590 prec = PyLong_AsLong(v);
9591 if (prec == -1 && PyErr_Occurred())
9592 goto onError;
9593 if (prec < 0)
9594 prec = 0;
9595 if (--fmtcnt >= 0)
9596 c = *fmt++;
9597 }
9598 else if (c >= '0' && c <= '9') {
9599 prec = c - '0';
9600 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009601 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009602 if (c < '0' || c > '9')
9603 break;
/* NOTE(review): same wraparound-dependent overflow test as for width. */
9604 if ((prec*10) / 10 != prec) {
9605 PyErr_SetString(PyExc_ValueError,
9606 "prec too big");
9607 goto onError;
9608 }
9609 prec = prec*10 + (c - '0');
9610 }
9611 }
9612 } /* prec */
/* A length modifier ('h', 'l' or 'L') is accepted and skipped. */
9613 if (fmtcnt >= 0) {
9614 if (c == 'h' || c == 'l' || c == 'L') {
9615 if (--fmtcnt >= 0)
9616 c = *fmt++;
9617 }
9618 }
/* The format ended before a conversion character was seen. */
9619 if (fmtcnt < 0) {
9620 PyErr_SetString(PyExc_ValueError,
9621 "incomplete format");
9622 goto onError;
9623 }
/* Every conversion except "%%" takes one argument. */
9624 if (c != '%') {
9625 v = getnextarg(args, arglen, &argidx);
9626 if (v == NULL)
9627 goto onError;
9628 }
9629 sign = 0;
9630 fill = ' ';
/* Each case below sets pbuf/len to the text to insert.  temp, when set,
   is the object that owns that text and is released after copying.
   sign = 1 marks a numeric conversion whose sign is handled separately. */
9631 switch (c) {
9632
9633 case '%':
9634 pbuf = formatbuf;
9635 /* presume that buffer length is at least 1 */
9636 pbuf[0] = '%';
9637 len = 1;
9638 break;
9639
9640 case 's':
9641 case 'r':
9642 case 'a':
/* An exact str formatted with %s is used as is (new reference). */
Victor Stinner808fc0a2010-03-22 12:50:40 +00009643 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009644 temp = v;
9645 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009646 }
9647 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009648 if (c == 's')
9649 temp = PyObject_Str(v);
9650 else if (c == 'r')
9651 temp = PyObject_Repr(v);
9652 else
9653 temp = PyObject_ASCII(v);
9654 if (temp == NULL)
9655 goto onError;
9656 if (PyUnicode_Check(temp))
9657 /* nothing to do */;
9658 else {
9659 Py_DECREF(temp);
9660 PyErr_SetString(PyExc_TypeError,
9661 "%s argument has non-string str()");
9662 goto onError;
9663 }
9664 }
9665 pbuf = PyUnicode_AS_UNICODE(temp);
9666 len = PyUnicode_GET_SIZE(temp);
/* For these conversions the precision truncates the text. */
9667 if (prec >= 0 && len > prec)
9668 len = prec;
9669 break;
9670
9671 case 'i':
9672 case 'd':
9673 case 'u':
9674 case 'o':
9675 case 'x':
9676 case 'X':
9677 if (c == 'i')
9678 c = 'd';
9679 isnumok = 0;
/* Accept an int directly, or anything PyNumber_Long() turns into one. */
9680 if (PyNumber_Check(v)) {
9681 PyObject *iobj=NULL;
9682
9683 if (PyLong_Check(v)) {
9684 iobj = v;
9685 Py_INCREF(iobj);
9686 }
9687 else {
9688 iobj = PyNumber_Long(v);
9689 }
9690 if (iobj!=NULL) {
9691 if (PyLong_Check(iobj)) {
9692 isnumok = 1;
9693 temp = formatlong(iobj, flags, prec, c);
9694 Py_DECREF(iobj);
9695 if (!temp)
9696 goto onError;
9697 pbuf = PyUnicode_AS_UNICODE(temp);
9698 len = PyUnicode_GET_SIZE(temp);
9699 sign = 1;
9700 }
9701 else {
9702 Py_DECREF(iobj);
9703 }
9704 }
9705 }
9706 if (!isnumok) {
9707 PyErr_Format(PyExc_TypeError,
9708 "%%%c format: a number is required, "
9709 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9710 goto onError;
9711 }
9712 if (flags & F_ZERO)
9713 fill = '0';
9714 break;
9715
9716 case 'e':
9717 case 'E':
9718 case 'f':
9719 case 'F':
9720 case 'g':
9721 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009722 temp = formatfloat(v, flags, prec, c);
9723 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009724 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009725 pbuf = PyUnicode_AS_UNICODE(temp);
9726 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009727 sign = 1;
9728 if (flags & F_ZERO)
9729 fill = '0';
9730 break;
9731
9732 case 'c':
9733 pbuf = formatbuf;
9734 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9735 if (len < 0)
9736 goto onError;
9737 break;
9738
9739 default:
/* NOTE(review): the lower bound 31 lets the control character 0x1F be
   printed verbatim; printable ASCII starts at 32 -- confirm intent. */
9740 PyErr_Format(PyExc_ValueError,
9741 "unsupported format character '%c' (0x%x) "
9742 "at index %zd",
9743 (31<=c && c<=126) ? (char)c : '?',
9744 (int)c,
9745 (Py_ssize_t)(fmt - 1 -
9746 PyUnicode_AS_UNICODE(uformat)));
9747 goto onError;
9748 }
/* Numeric conversion: strip a leading sign from the text and remember
   which sign character (if any) has to be written. */
9749 if (sign) {
9750 if (*pbuf == '-' || *pbuf == '+') {
9751 sign = *pbuf++;
9752 len--;
9753 }
9754 else if (flags & F_SIGN)
9755 sign = '+';
9756 else if (flags & F_BLANK)
9757 sign = ' ';
9758 else
9759 sign = 0;
9760 }
9761 if (width < len)
9762 width = len;
/* Make sure there is room for width characters plus an optional sign;
   a negative reslen after the addition is treated as overflow. */
9763 if (rescnt - (sign != 0) < width) {
9764 reslen -= rescnt;
9765 rescnt = width + fmtcnt + 100;
9766 reslen += rescnt;
9767 if (reslen < 0) {
9768 Py_XDECREF(temp);
9769 PyErr_NoMemory();
9770 goto onError;
9771 }
9772 if (_PyUnicode_Resize(&result, reslen) < 0) {
9773 Py_XDECREF(temp);
9774 goto onError;
9775 }
9776 res = PyUnicode_AS_UNICODE(result)
9777 + reslen - rescnt;
9778 }
/* With a non-space fill the sign goes before the padding; with a space
   fill it is written after the padding (see below). */
9779 if (sign) {
9780 if (fill != ' ')
9781 *res++ = sign;
9782 rescnt--;
9783 if (width > len)
9784 width--;
9785 }
/* Same ordering rule for the two-character prefix of #x, #X and #o. */
9786 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9787 assert(pbuf[0] == '0');
9788 assert(pbuf[1] == c);
9789 if (fill != ' ') {
9790 *res++ = *pbuf++;
9791 *res++ = *pbuf++;
9792 }
9793 rescnt -= 2;
9794 width -= 2;
9795 if (width < 0)
9796 width = 0;
9797 len -= 2;
9798 }
/* Left padding (right justification), unless '-' was given. */
9799 if (width > len && !(flags & F_LJUST)) {
9800 do {
9801 --rescnt;
9802 *res++ = fill;
9803 } while (--width > len);
9804 }
9805 if (fill == ' ') {
9806 if (sign)
9807 *res++ = sign;
9808 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9809 assert(pbuf[0] == '0');
9810 assert(pbuf[1] == c);
9811 *res++ = *pbuf++;
9812 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009813 }
9814 }
/* The converted text itself, followed by right padding with spaces when
   the field is left-justified. */
Benjamin Peterson29060642009-01-31 22:14:21 +00009815 Py_UNICODE_COPY(res, pbuf, len);
9816 res += len;
9817 rescnt -= len;
9818 while (--width >= len) {
9819 --rescnt;
9820 *res++ = ' ';
9821 }
/* NOTE(review): with a mapping, this rejects a conversion that left its
   argument unconsumed; how argidx is advanced is decided by getnextarg(),
   which is not visible here -- confirm against its definition. */
9822 if (dict && (argidx < arglen) && c != '%') {
9823 PyErr_SetString(PyExc_TypeError,
9824 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009825 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009826 goto onError;
9827 }
9828 Py_XDECREF(temp);
9829 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830 } /* until end */
/* Without a mapping, leftover arguments are an error. */
9831 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009832 PyErr_SetString(PyExc_TypeError,
9833 "not all arguments converted during string formatting");
9834 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009835 }
9836
/* Trim result to the number of characters actually written. */
Thomas Woutersa96affe2006-03-12 00:29:36 +00009837 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009838 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009839 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009840 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841 }
9842 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843 return (PyObject *)result;
9844
/* Common error exit: release everything this function owns. */
Benjamin Peterson29060642009-01-31 22:14:21 +00009845 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846 Py_XDECREF(result);
9847 Py_DECREF(uformat);
9848 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009849 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850 }
9851 return NULL;
9852}
9853
Jeremy Hylton938ace62002-07-17 16:30:39 +00009854static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009855unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9856
Tim Peters6d6c1a32001-08-02 04:15:00 +00009857static PyObject *
9858unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9859{
Benjamin Peterson29060642009-01-31 22:14:21 +00009860 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009861 static char *kwlist[] = {"object", "encoding", "errors", 0};
9862 char *encoding = NULL;
9863 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009864
Benjamin Peterson14339b62009-01-31 16:36:08 +00009865 if (type != &PyUnicode_Type)
9866 return unicode_subtype_new(type, args, kwds);
9867 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009868 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009869 return NULL;
9870 if (x == NULL)
9871 return (PyObject *)_PyUnicode_New(0);
9872 if (encoding == NULL && errors == NULL)
9873 return PyObject_Str(x);
9874 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009875 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009876}
9877
Guido van Rossume023fe02001-08-30 03:12:59 +00009878static PyObject *
9879unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9880{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009881 PyUnicodeObject *tmp, *pnew;
9882 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009883
Benjamin Peterson14339b62009-01-31 16:36:08 +00009884 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9885 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9886 if (tmp == NULL)
9887 return NULL;
9888 assert(PyUnicode_Check(tmp));
9889 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9890 if (pnew == NULL) {
9891 Py_DECREF(tmp);
9892 return NULL;
9893 }
9894 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9895 if (pnew->str == NULL) {
9896 _Py_ForgetReference((PyObject *)pnew);
9897 PyObject_Del(pnew);
9898 Py_DECREF(tmp);
9899 return PyErr_NoMemory();
9900 }
9901 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9902 pnew->length = n;
9903 pnew->hash = tmp->hash;
9904 Py_DECREF(tmp);
9905 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009906}
9907
/* Docstring of the str type; referenced as tp_doc by PyUnicode_Type. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009908PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009909 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009910\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009911Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009912encoding defaults to the current default string encoding.\n\
9913errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009914
/* Forward declaration: unicode_iter() is defined after the type object
   that refers to it as tp_iter. */
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009915static PyObject *unicode_iter(PyObject *seq);
9916
/* Type object for str.  The initializer is positional, so every slot up to
   tp_free is listed in declaration order; a 0 leaves the slot unset. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009918 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009919 "str", /* tp_name */
9920 sizeof(PyUnicodeObject), /* tp_basicsize */
9921 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009923 (destructor)unicode_dealloc, /* tp_dealloc */
9924 0, /* tp_print */
9925 0, /* tp_getattr */
9926 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009927 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009928 unicode_repr, /* tp_repr */
9929 &unicode_as_number, /* tp_as_number */
9930 &unicode_as_sequence, /* tp_as_sequence */
9931 &unicode_as_mapping, /* tp_as_mapping */
9932 (hashfunc) unicode_hash, /* tp_hash*/
9933 0, /* tp_call*/
9934 (reprfunc) unicode_str, /* tp_str */
9935 PyObject_GenericGetAttr, /* tp_getattro */
9936 0, /* tp_setattro */
9937 0, /* tp_as_buffer */
9938 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009939 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009940 unicode_doc, /* tp_doc */
9941 0, /* tp_traverse */
9942 0, /* tp_clear */
9943 PyUnicode_RichCompare, /* tp_richcompare */
9944 0, /* tp_weaklistoffset */
9945 unicode_iter, /* tp_iter */
9946 0, /* tp_iternext */
9947 unicode_methods, /* tp_methods */
9948 0, /* tp_members */
9949 0, /* tp_getset */
9950 &PyBaseObject_Type, /* tp_base */
9951 0, /* tp_dict */
9952 0, /* tp_descr_get */
9953 0, /* tp_descr_set */
9954 0, /* tp_dictoffset */
9955 0, /* tp_init */
9956 0, /* tp_alloc */
9957 unicode_new, /* tp_new */
9958 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959};
9960
9961/* Initialize the Unicode implementation */
9962
Thomas Wouters78890102000-07-22 19:25:51 +00009963void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009965 int i;
9966
Thomas Wouters477c8d52006-05-27 19:21:47 +00009967 /* XXX - move this array to unicodectype.c ? */
9968 Py_UNICODE linebreak[] = {
9969 0x000A, /* LINE FEED */
9970 0x000D, /* CARRIAGE RETURN */
9971 0x001C, /* FILE SEPARATOR */
9972 0x001D, /* GROUP SEPARATOR */
9973 0x001E, /* RECORD SEPARATOR */
9974 0x0085, /* NEXT LINE */
9975 0x2028, /* LINE SEPARATOR */
9976 0x2029, /* PARAGRAPH SEPARATOR */
9977 };
9978
Fred Drakee4315f52000-05-09 19:53:39 +00009979 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +00009980 free_list = NULL;
9981 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009983 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00009984 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009985
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009986 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00009987 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00009988 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +00009990
9991 /* initialize the linebreak bloom filter */
9992 bloom_linebreak = make_bloom_mask(
9993 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9994 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009995
9996 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997}
9998
9999/* Finalize the Unicode implementation */
10000
Christian Heimesa156e092008-02-16 07:38:31 +000010001int
10002PyUnicode_ClearFreeList(void)
10003{
10004 int freelist_size = numfree;
10005 PyUnicodeObject *u;
10006
10007 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010008 PyUnicodeObject *v = u;
10009 u = *(PyUnicodeObject **)u;
10010 if (v->str)
10011 PyObject_DEL(v->str);
10012 Py_XDECREF(v->defenc);
10013 PyObject_Del(v);
10014 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010015 }
10016 free_list = NULL;
10017 assert(numfree == 0);
10018 return freelist_size;
10019}
10020
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021void
Thomas Wouters78890102000-07-22 19:25:51 +000010022_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010024 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010026 Py_XDECREF(unicode_empty);
10027 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010028
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010029 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 if (unicode_latin1[i]) {
10031 Py_DECREF(unicode_latin1[i]);
10032 unicode_latin1[i] = NULL;
10033 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010034 }
Christian Heimesa156e092008-02-16 07:38:31 +000010035 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010037
Walter Dörwald16807132007-05-25 13:52:07 +000010038void
10039PyUnicode_InternInPlace(PyObject **p)
10040{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010041 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10042 PyObject *t;
10043 if (s == NULL || !PyUnicode_Check(s))
10044 Py_FatalError(
10045 "PyUnicode_InternInPlace: unicode strings only please!");
10046 /* If it's a subclass, we don't really know what putting
10047 it in the interned dict might do. */
10048 if (!PyUnicode_CheckExact(s))
10049 return;
10050 if (PyUnicode_CHECK_INTERNED(s))
10051 return;
10052 if (interned == NULL) {
10053 interned = PyDict_New();
10054 if (interned == NULL) {
10055 PyErr_Clear(); /* Don't leave an exception */
10056 return;
10057 }
10058 }
10059 /* It might be that the GetItem call fails even
10060 though the key is present in the dictionary,
10061 namely when this happens during a stack overflow. */
10062 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010063 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010064 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010065
Benjamin Peterson29060642009-01-31 22:14:21 +000010066 if (t) {
10067 Py_INCREF(t);
10068 Py_DECREF(*p);
10069 *p = t;
10070 return;
10071 }
Walter Dörwald16807132007-05-25 13:52:07 +000010072
Benjamin Peterson14339b62009-01-31 16:36:08 +000010073 PyThreadState_GET()->recursion_critical = 1;
10074 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10075 PyErr_Clear();
10076 PyThreadState_GET()->recursion_critical = 0;
10077 return;
10078 }
10079 PyThreadState_GET()->recursion_critical = 0;
10080 /* The two references in interned are not counted by refcnt.
10081 The deallocator will take care of this */
10082 Py_REFCNT(s) -= 2;
10083 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010084}
10085
10086void
10087PyUnicode_InternImmortal(PyObject **p)
10088{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010089 PyUnicode_InternInPlace(p);
10090 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10091 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10092 Py_INCREF(*p);
10093 }
Walter Dörwald16807132007-05-25 13:52:07 +000010094}
10095
10096PyObject *
10097PyUnicode_InternFromString(const char *cp)
10098{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010099 PyObject *s = PyUnicode_FromString(cp);
10100 if (s == NULL)
10101 return NULL;
10102 PyUnicode_InternInPlace(&s);
10103 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010104}
10105
10106void _Py_ReleaseInternedUnicodeStrings(void)
10107{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010108 PyObject *keys;
10109 PyUnicodeObject *s;
10110 Py_ssize_t i, n;
10111 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010112
Benjamin Peterson14339b62009-01-31 16:36:08 +000010113 if (interned == NULL || !PyDict_Check(interned))
10114 return;
10115 keys = PyDict_Keys(interned);
10116 if (keys == NULL || !PyList_Check(keys)) {
10117 PyErr_Clear();
10118 return;
10119 }
Walter Dörwald16807132007-05-25 13:52:07 +000010120
Benjamin Peterson14339b62009-01-31 16:36:08 +000010121 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10122 detector, interned unicode strings are not forcibly deallocated;
10123 rather, we give them their stolen references back, and then clear
10124 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010125
Benjamin Peterson14339b62009-01-31 16:36:08 +000010126 n = PyList_GET_SIZE(keys);
10127 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010128 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010129 for (i = 0; i < n; i++) {
10130 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10131 switch (s->state) {
10132 case SSTATE_NOT_INTERNED:
10133 /* XXX Shouldn't happen */
10134 break;
10135 case SSTATE_INTERNED_IMMORTAL:
10136 Py_REFCNT(s) += 1;
10137 immortal_size += s->length;
10138 break;
10139 case SSTATE_INTERNED_MORTAL:
10140 Py_REFCNT(s) += 2;
10141 mortal_size += s->length;
10142 break;
10143 default:
10144 Py_FatalError("Inconsistent interned string state.");
10145 }
10146 s->state = SSTATE_NOT_INTERNED;
10147 }
10148 fprintf(stderr, "total size of all interned strings: "
10149 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10150 "mortal/immortal\n", mortal_size, immortal_size);
10151 Py_DECREF(keys);
10152 PyDict_Clear(interned);
10153 Py_DECREF(interned);
10154 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010155}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010156
10157
10158/********************* Unicode Iterator **************************/
10159
/* State of a str iterator: the string being walked and the position of the
   next character to hand out. */
10160typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010161 PyObject_HEAD
/* Index of the next character to return; advanced by unicodeiter_next(). */
10162 Py_ssize_t it_index;
/* Reference to the iterated string, taken in unicode_iter() and released
   on exhaustion or deallocation. */
10163 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010164} unicodeiterobject;
10165
10166static void
10167unicodeiter_dealloc(unicodeiterobject *it)
10168{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010169 _PyObject_GC_UNTRACK(it);
10170 Py_XDECREF(it->it_seq);
10171 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010172}
10173
10174static int
10175unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10176{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010177 Py_VISIT(it->it_seq);
10178 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010179}
10180
10181static PyObject *
10182unicodeiter_next(unicodeiterobject *it)
10183{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010184 PyUnicodeObject *seq;
10185 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010186
Benjamin Peterson14339b62009-01-31 16:36:08 +000010187 assert(it != NULL);
10188 seq = it->it_seq;
10189 if (seq == NULL)
10190 return NULL;
10191 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010192
Benjamin Peterson14339b62009-01-31 16:36:08 +000010193 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10194 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010195 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010196 if (item != NULL)
10197 ++it->it_index;
10198 return item;
10199 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010200
Benjamin Peterson14339b62009-01-31 16:36:08 +000010201 Py_DECREF(seq);
10202 it->it_seq = NULL;
10203 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010204}
10205
10206static PyObject *
10207unicodeiter_len(unicodeiterobject *it)
10208{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010209 Py_ssize_t len = 0;
10210 if (it->it_seq)
10211 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10212 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010213}
10214
/* Docstring shared by the __length_hint__ entry below. */
10215PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10216
/* Method table of the str iterator; referenced as tp_methods by
   PyUnicodeIter_Type. */
10217static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010218 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010219 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010220 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010221};
10222
/* Type object for the str iterator ("str_iterator").  The initializer is
   positional; the slots after the last listed entry are left unset. */
10223PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010224 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10225 "str_iterator", /* tp_name */
10226 sizeof(unicodeiterobject), /* tp_basicsize */
10227 0, /* tp_itemsize */
10228 /* methods */
10229 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10230 0, /* tp_print */
10231 0, /* tp_getattr */
10232 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010233 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010234 0, /* tp_repr */
10235 0, /* tp_as_number */
10236 0, /* tp_as_sequence */
10237 0, /* tp_as_mapping */
10238 0, /* tp_hash */
10239 0, /* tp_call */
10240 0, /* tp_str */
10241 PyObject_GenericGetAttr, /* tp_getattro */
10242 0, /* tp_setattro */
10243 0, /* tp_as_buffer */
10244 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10245 0, /* tp_doc */
10246 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10247 0, /* tp_clear */
10248 0, /* tp_richcompare */
10249 0, /* tp_weaklistoffset */
10250 PyObject_SelfIter, /* tp_iter */
10251 (iternextfunc)unicodeiter_next, /* tp_iternext */
10252 unicodeiter_methods, /* tp_methods */
10253 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010254};
10255
10256static PyObject *
10257unicode_iter(PyObject *seq)
10258{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010259 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010260
Benjamin Peterson14339b62009-01-31 16:36:08 +000010261 if (!PyUnicode_Check(seq)) {
10262 PyErr_BadInternalCall();
10263 return NULL;
10264 }
10265 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10266 if (it == NULL)
10267 return NULL;
10268 it->it_index = 0;
10269 Py_INCREF(seq);
10270 it->it_seq = (PyUnicodeObject *)seq;
10271 _PyObject_GC_TRACK(it);
10272 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010273}
10274
Martin v. Löwis5b222132007-06-10 09:51:05 +000010275size_t
10276Py_UNICODE_strlen(const Py_UNICODE *u)
10277{
10278 int res = 0;
10279 while(*u++)
10280 res++;
10281 return res;
10282}
10283
10284Py_UNICODE*
10285Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10286{
10287 Py_UNICODE *u = s1;
10288 while ((*u++ = *s2++));
10289 return s1;
10290}
10291
10292Py_UNICODE*
10293Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10294{
10295 Py_UNICODE *u = s1;
10296 while ((*u++ = *s2++))
10297 if (n-- == 0)
10298 break;
10299 return s1;
10300}
10301
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010302Py_UNICODE*
10303Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10304{
10305 Py_UNICODE *u1 = s1;
10306 u1 += Py_UNICODE_strlen(u1);
10307 Py_UNICODE_strcpy(u1, s2);
10308 return s1;
10309}
10310
Martin v. Löwis5b222132007-06-10 09:51:05 +000010311int
10312Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10313{
10314 while (*s1 && *s2 && *s1 == *s2)
10315 s1++, s2++;
10316 if (*s1 && *s2)
10317 return (*s1 < *s2) ? -1 : +1;
10318 if (*s1)
10319 return 1;
10320 if (*s2)
10321 return -1;
10322 return 0;
10323}
10324
Victor Stinneref8d95c2010-08-16 22:03:11 +000010325int
10326Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10327{
10328 register Py_UNICODE u1, u2;
10329 for (; n != 0; n--) {
10330 u1 = *s1;
10331 u2 = *s2;
10332 if (u1 != u2)
10333 return (u1 < u2) ? -1 : +1;
10334 if (u1 == '\0')
10335 return 0;
10336 s1++;
10337 s2++;
10338 }
10339 return 0;
10340}
10341
Martin v. Löwis5b222132007-06-10 09:51:05 +000010342Py_UNICODE*
10343Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10344{
10345 const Py_UNICODE *p;
10346 for (p = s; *p; p++)
10347 if (*p == c)
10348 return (Py_UNICODE*)p;
10349 return NULL;
10350}
10351
Victor Stinner331ea922010-08-10 16:37:20 +000010352Py_UNICODE*
10353Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10354{
10355 const Py_UNICODE *p;
10356 p = s + Py_UNICODE_strlen(s);
10357 while (p != s) {
10358 p--;
10359 if (*p == c)
10360 return (Py_UNICODE*)p;
10361 }
10362 return NULL;
10363}
10364
Victor Stinner71133ff2010-09-01 23:43:53 +000010365Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010366PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010367{
10368 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10369 Py_UNICODE *copy;
10370 Py_ssize_t size;
10371
10372 /* Ensure we won't overflow the size. */
10373 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10374 PyErr_NoMemory();
10375 return NULL;
10376 }
10377 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10378 size *= sizeof(Py_UNICODE);
10379 copy = PyMem_Malloc(size);
10380 if (copy == NULL) {
10381 PyErr_NoMemory();
10382 return NULL;
10383 }
10384 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10385 return copy;
10386}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010387
Georg Brandl66c221e2010-10-14 07:04:07 +000010388/* A _string module, to export formatter_parser and formatter_field_name_split
10389 to the string.Formatter class implemented in Python. */
10390
10391static PyMethodDef _string_methods[] = {
10392 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10393 METH_O, PyDoc_STR("split the argument as a field name")},
10394 {"formatter_parser", (PyCFunction) formatter_parser,
10395 METH_O, PyDoc_STR("parse the argument as a format string")},
10396 {NULL, NULL}
10397};
10398
10399static struct PyModuleDef _string_module = {
10400 PyModuleDef_HEAD_INIT,
10401 "_string",
10402 PyDoc_STR("string helper module"),
10403 0,
10404 _string_methods,
10405 NULL,
10406 NULL,
10407 NULL,
10408 NULL
10409};
10410
10411PyMODINIT_FUNC
10412PyInit__string(void)
10413{
10414 return PyModule_Create(&_string_module);
10415}
10416
10417
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010418#ifdef __cplusplus
10419}
10420#endif