blob: 22d213795a4e5e92cba021f42c18a67eeac1d237 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
Christian Heimes190d79e2008-01-30 11:58:22 +0000114/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point 0x00..0x7F; an entry is 1 when the
   character is treated as whitespace and 0 otherwise.  Each row below
   covers 16 consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
144
Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
Christian Heimes190d79e2008-01-30 11:58:22 +0000158/* Same for linebreaks */
/* Lookup table indexed by code point 0x00..0x7F; an entry is 1 when the
   character is a line break and 0 otherwise.  Each row below covers 16
   consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
/* Integer type holding a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for characters outside ASCII. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of
   `ch` (ch modulo BLOOM_WIDTH).  `mask` is parenthesized so compound
   expressions such as `a | b` are handled as a unit; for BLOOM_ADD it
   must still be an lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero when `ch` is a line break.  ASCII goes through the
   ascii_linebreak table; everything else is pre-filtered by the bloom
   mask before the full Py_UNICODE_ISLINEBREAK test.  Note that `ch` is
   evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Non-zero when `chr` is a member of `set`: the bloom mask rejects most
   non-members cheaply before the exact unicode_member() scan.  The whole
   expansion is parenthesized so the macro can be negated or combined
   with other operators safely.  `chr` is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312
313 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316*/
317
Alexander Belopolsky40018472011-02-26 01:02:56 +0000318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320{
321 register PyUnicodeObject *unicode;
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty);
326 return unicode_empty;
327 }
328
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000329 /* Ensure we won't overflow the size. */
330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331 return (PyUnicodeObject *)PyErr_NoMemory();
332 }
333
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000335 if (free_list) {
336 unicode = free_list;
337 free_list = *(PyUnicodeObject **)unicode;
338 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 if (unicode->str) {
340 /* Keep-Alive optimization: we only upsize the buffer,
341 never downsize it. */
342 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000343 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 PyObject_DEL(unicode->str);
345 unicode->str = NULL;
346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000347 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 }
352 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 }
354 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 if (unicode == NULL)
358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000363 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 PyErr_NoMemory();
365 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000367 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000368 * the caller fails before initializing str -- unicode_resize()
369 * reads str[0], and the Keep-Alive optimization can keep memory
370 * allocated for str alive across a call to unicode_dealloc(unicode).
371 * We don't want unicode_resize to read uninitialized memory in
372 * that case.
373 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000374 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000383 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000386 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388}
389
Alexander Belopolsky40018472011-02-26 01:02:56 +0000390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392{
Walter Dörwald16807132007-05-25 13:52:07 +0000393 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000394 case SSTATE_NOT_INTERNED:
395 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000396
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError(
402 "deletion of interned string failed");
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_IMMORTAL:
406 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 default:
409 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410 }
411
Guido van Rossum604ddf82001-12-06 20:03:56 +0000412 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000414 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416 PyObject_DEL(unicode->str);
417 unicode->str = NULL;
418 unicode->length = 0;
419 }
420 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000421 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 }
423 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000424 *(PyUnicodeObject **)unicode = free_list;
425 free_list = unicode;
426 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000429 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 }
433}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
Mark Dickinson081dfee2009-03-18 14:47:41 +0000581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
Walter Dörwald346737f2007-05-31 10:44:43 +0000681static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000684{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000685 *fmt++ = '%';
686 if (width) {
687 if (zeropad)
688 *fmt++ = '0';
689 fmt += sprintf(fmt, "%d", width);
690 }
691 if (precision)
692 fmt += sprintf(fmt, ".%d", precision);
693 if (longflag)
694 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000695 else if (longlongflag) {
696 /* longlongflag should only ever be nonzero on machines with
697 HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699 char *f = PY_FORMAT_LONG_LONG;
700 while (*f)
701 *fmt++ = *f++;
702#else
703 /* we shouldn't ever get here */
704 assert(0);
705 *fmt++ = 'l';
706#endif
707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000708 else if (size_tflag) {
709 char *f = PY_FORMAT_SIZE_T;
710 while (*f)
711 *fmt++ = *f++;
712 }
713 *fmt++ = c;
714 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000715}
716
Victor Stinner96865452011-03-01 23:44:09 +0000717/* helper for PyUnicode_FromFormatV() */
718
719static const char*
720parse_format_flags(const char *f,
721 int *p_width, int *p_precision,
722 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
723{
724 int width, precision, longflag, longlongflag, size_tflag;
725
726 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
727 f++;
728 width = 0;
729 while (Py_ISDIGIT((unsigned)*f))
730 width = (width*10) + *f++ - '0';
731 precision = 0;
732 if (*f == '.') {
733 f++;
734 while (Py_ISDIGIT((unsigned)*f))
735 precision = (precision*10) + *f++ - '0';
736 if (*f == '%') {
737 /* "%.3%s" => f points to "3" */
738 f--;
739 }
740 }
741 if (*f == '\0') {
742 /* bogus format "%.1" => go backward, f points to "1" */
743 f--;
744 }
745 if (p_width != NULL)
746 *p_width = width;
747 if (p_precision != NULL)
748 *p_precision = precision;
749
750 /* Handle %ld, %lu, %lld and %llu. */
751 longflag = 0;
752 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +0000753 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +0000754
755 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +0000756 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +0000757 longflag = 1;
758 ++f;
759 }
760#ifdef HAVE_LONG_LONG
761 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +0000762 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +0000763 longlongflag = 1;
764 f += 2;
765 }
766#endif
767 }
768 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +0000769 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +0000770 size_tflag = 1;
771 ++f;
772 }
773 if (p_longflag != NULL)
774 *p_longflag = longflag;
775 if (p_longlongflag != NULL)
776 *p_longlongflag = longlongflag;
777 if (p_size_tflag != NULL)
778 *p_size_tflag = size_tflag;
779 return f;
780}
781
Walter Dörwaldd2034312007-05-18 16:29:38 +0000782#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
783
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000784/* size of fixed-size buffer for formatting single arguments */
785#define ITEM_BUFFER_LEN 21
786/* maximum number of characters required for output of %ld. 21 characters
787 allows for 64-bit integers (in decimal) and an optional sign. */
788#define MAX_LONG_CHARS 21
789/* maximum number of characters required for output of %lld.
790 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
791 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
792#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794PyObject *
795PyUnicode_FromFormatV(const char *format, va_list vargs)
796{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 va_list count;
798 Py_ssize_t callcount = 0;
799 PyObject **callresults = NULL;
800 PyObject **callresult = NULL;
801 Py_ssize_t n = 0;
802 int width = 0;
803 int precision = 0;
804 int zeropad;
805 const char* f;
806 Py_UNICODE *s;
807 PyObject *string;
808 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000809 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* use abuffer instead of buffer, if we need more space
811 * (which can happen if there's a format specifier with width). */
812 char *abuffer = NULL;
813 char *realbuffer;
814 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000816 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000818 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000819 /* step 1: count the number of %S/%R/%A/%s format specifications
820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000824 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +0000825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000828 ++callcount;
829 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000830 else if (128 <= (unsigned char)*f) {
831 PyErr_Format(PyExc_ValueError,
832 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000833 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000834 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000835 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000837 }
838 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000840 if (callcount) {
841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842 if (!callresults) {
843 PyErr_NoMemory();
844 return NULL;
845 }
846 callresult = callresults;
847 }
848 /* step 3: figure out how large a buffer we need */
849 for (f = format; *f; f++) {
850 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000851#ifdef HAVE_LONG_LONG
Victor Stinner96865452011-03-01 23:44:09 +0000852 int longlongflag;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000853#endif
Victor Stinner96865452011-03-01 23:44:09 +0000854 const char* p;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855
Victor Stinner96865452011-03-01 23:44:09 +0000856 p = f;
857 f = parse_format_flags(f, &width, NULL,
858 NULL, &longlongflag, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859
Benjamin Peterson14339b62009-01-31 16:36:08 +0000860 switch (*f) {
861 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000862 {
863#ifndef Py_UNICODE_WIDE
864 int ordinal = va_arg(count, int);
865 if (ordinal > 0xffff)
866 n += 2;
867 else
868 n++;
869#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000870 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000871 n++;
872#endif
873 break;
874 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000875 case '%':
876 n++;
877 break;
878 case 'd': case 'u': case 'i': case 'x':
879 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000880#ifdef HAVE_LONG_LONG
881 if (longlongflag) {
882 if (width < MAX_LONG_LONG_CHARS)
883 width = MAX_LONG_LONG_CHARS;
884 }
885 else
886#endif
887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888 including sign. Decimal takes the most space. This
889 isn't enough for octal. If a width is specified we
890 need more (which we allocate later). */
891 if (width < MAX_LONG_CHARS)
892 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000893 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000894 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000895 if (abuffersize < width)
896 abuffersize = width;
897 break;
898 case 's':
899 {
900 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000901 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903 if (!str)
904 goto fail;
905 n += PyUnicode_GET_SIZE(str);
906 /* Remember the str and switch to the next slot */
907 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000908 break;
909 }
910 case 'U':
911 {
912 PyObject *obj = va_arg(count, PyObject *);
913 assert(obj && PyUnicode_Check(obj));
914 n += PyUnicode_GET_SIZE(obj);
915 break;
916 }
917 case 'V':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000921 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000922 assert(obj || str);
923 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +0000924 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000925 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000926 *callresult++ = NULL;
927 }
928 else {
929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930 if (!str_obj)
931 goto fail;
932 n += PyUnicode_GET_SIZE(str_obj);
933 *callresult++ = str_obj;
934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000935 break;
936 }
937 case 'S':
938 {
939 PyObject *obj = va_arg(count, PyObject *);
940 PyObject *str;
941 assert(obj);
942 str = PyObject_Str(obj);
943 if (!str)
944 goto fail;
945 n += PyUnicode_GET_SIZE(str);
946 /* Remember the str and switch to the next slot */
947 *callresult++ = str;
948 break;
949 }
950 case 'R':
951 {
952 PyObject *obj = va_arg(count, PyObject *);
953 PyObject *repr;
954 assert(obj);
955 repr = PyObject_Repr(obj);
956 if (!repr)
957 goto fail;
958 n += PyUnicode_GET_SIZE(repr);
959 /* Remember the repr and switch to the next slot */
960 *callresult++ = repr;
961 break;
962 }
963 case 'A':
964 {
965 PyObject *obj = va_arg(count, PyObject *);
966 PyObject *ascii;
967 assert(obj);
968 ascii = PyObject_ASCII(obj);
969 if (!ascii)
970 goto fail;
971 n += PyUnicode_GET_SIZE(ascii);
972 /* Remember the repr and switch to the next slot */
973 *callresult++ = ascii;
974 break;
975 }
976 case 'p':
977 (void) va_arg(count, int);
978 /* maximum 64-bit pointer representation:
979 * 0xffffffffffffffff
980 * so 19 characters is enough.
981 * XXX I count 18 -- what's the extra for?
982 */
983 n += 19;
984 break;
985 default:
986 /* if we stumble upon an unknown
987 formatting code, copy the rest of
988 the format string to the output
989 string. (we cannot just skip the
990 code, since there's no way to know
991 what's in the argument list) */
992 n += strlen(p);
993 goto expand;
994 }
995 } else
996 n++;
997 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000998 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000999 if (abuffersize > ITEM_BUFFER_LEN) {
1000 /* add 1 for sprintf's trailing null byte */
1001 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 if (!abuffer) {
1003 PyErr_NoMemory();
1004 goto fail;
1005 }
1006 realbuffer = abuffer;
1007 }
1008 else
1009 realbuffer = buffer;
1010 /* step 4: fill the buffer */
1011 /* Since we've analyzed how much space we need for the worst case,
1012 we don't have to resize the string.
1013 There can be no errors beyond this point. */
1014 string = PyUnicode_FromUnicode(NULL, n);
1015 if (!string)
1016 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001017
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 s = PyUnicode_AS_UNICODE(string);
1019 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 for (f = format; *f; f++) {
1022 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001023 const char* p;
1024 int longflag;
1025 int longlongflag;
1026 int size_tflag;
1027
1028 p = f;
1029 zeropad = (f[1] == '0');
1030 f = parse_format_flags(f, &width, &precision,
1031 &longflag, &longlongflag, &size_tflag);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001032
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 switch (*f) {
1034 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001035 {
1036 int ordinal = va_arg(vargs, int);
1037#ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 *s++ = 0xD800 | (ordinal >> 10);
1041 *s++ = 0xDC00 | (ordinal & 0x3FF);
1042 } else
1043#endif
1044 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001046 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001047 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001048 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
Victor Stinner6d970f42011-03-02 00:04:25 +00001050 width, precision, *f);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 if (longflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001053#ifdef HAVE_LONG_LONG
1054 else if (longlongflag)
1055 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1056#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001057 else if (size_tflag)
1058 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1059 else
1060 sprintf(realbuffer, fmt, va_arg(vargs, int));
1061 appendstring(realbuffer);
1062 break;
1063 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001064 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1065 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001066 if (longflag)
1067 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001068#ifdef HAVE_LONG_LONG
1069 else if (longlongflag)
1070 sprintf(realbuffer, fmt, va_arg(vargs,
1071 unsigned PY_LONG_LONG));
1072#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001073 else if (size_tflag)
1074 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1075 else
1076 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1077 appendstring(realbuffer);
1078 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001080 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001081 sprintf(realbuffer, fmt, va_arg(vargs, int));
1082 appendstring(realbuffer);
1083 break;
1084 case 's':
1085 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001086 /* unused, since we already have the result */
1087 (void) va_arg(vargs, char *);
1088 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1089 PyUnicode_GET_SIZE(*callresult));
1090 s += PyUnicode_GET_SIZE(*callresult);
1091 /* We're done with the unicode()/repr() => forget it */
1092 Py_DECREF(*callresult);
1093 /* switch to next unicode()/repr() result */
1094 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001095 break;
1096 }
1097 case 'U':
1098 {
1099 PyObject *obj = va_arg(vargs, PyObject *);
1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102 s += size;
1103 break;
1104 }
1105 case 'V':
1106 {
1107 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001108 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001109 if (obj) {
1110 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1111 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1112 s += size;
1113 } else {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1115 PyUnicode_GET_SIZE(*callresult));
1116 s += PyUnicode_GET_SIZE(*callresult);
1117 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001119 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001120 break;
1121 }
1122 case 'S':
1123 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001124 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 {
1126 Py_UNICODE *ucopy;
1127 Py_ssize_t usize;
1128 Py_ssize_t upos;
1129 /* unused, since we already have the result */
1130 (void) va_arg(vargs, PyObject *);
1131 ucopy = PyUnicode_AS_UNICODE(*callresult);
1132 usize = PyUnicode_GET_SIZE(*callresult);
1133 for (upos = 0; upos<usize;)
1134 *s++ = ucopy[upos++];
1135 /* We're done with the unicode()/repr() => forget it */
1136 Py_DECREF(*callresult);
1137 /* switch to next unicode()/repr() result */
1138 ++callresult;
1139 break;
1140 }
1141 case 'p':
1142 sprintf(buffer, "%p", va_arg(vargs, void*));
1143 /* %p is ill-defined: ensure leading 0x. */
1144 if (buffer[1] == 'X')
1145 buffer[1] = 'x';
1146 else if (buffer[1] != 'x') {
1147 memmove(buffer+2, buffer, strlen(buffer)+1);
1148 buffer[0] = '0';
1149 buffer[1] = 'x';
1150 }
1151 appendstring(buffer);
1152 break;
1153 case '%':
1154 *s++ = '%';
1155 break;
1156 default:
1157 appendstring(p);
1158 goto end;
1159 }
Victor Stinner1205f272010-09-11 00:54:47 +00001160 }
Victor Stinner1205f272010-09-11 00:54:47 +00001161 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 *s++ = *f;
1163 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
Benjamin Peterson29060642009-01-31 22:14:21 +00001165 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 if (callresults)
1167 PyObject_Free(callresults);
1168 if (abuffer)
1169 PyObject_Free(abuffer);
1170 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1171 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001173 if (callresults) {
1174 PyObject **callresult2 = callresults;
1175 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001176 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001177 ++callresult2;
1178 }
1179 PyObject_Free(callresults);
1180 }
1181 if (abuffer)
1182 PyObject_Free(abuffer);
1183 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001184}
1185
1186#undef appendstring
1187
1188PyObject *
1189PyUnicode_FromFormat(const char *format, ...)
1190{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 PyObject* ret;
1192 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001193
1194#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001195 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001196#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001197 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001198#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001199 ret = PyUnicode_FromFormatV(format, vargs);
1200 va_end(vargs);
1201 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001202}
1203
Victor Stinner5593d8a2010-10-02 11:11:27 +00001204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1205 convert a Unicode object to a wide character string.
1206
1207 - If w is NULL: return the number of wide characters (including the nul
1208 character) required to convert the unicode object. Ignore size argument.
1209
1210 - Otherwise: return the number of wide characters (excluding the nul
1211 character) written into w. Write at most size wide characters (including
1212 the nul character). */
1213static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001214unicode_aswidechar(PyUnicodeObject *unicode,
1215 wchar_t *w,
1216 Py_ssize_t size)
1217{
1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001219 Py_ssize_t res;
1220 if (w != NULL) {
1221 res = PyUnicode_GET_SIZE(unicode);
1222 if (size > res)
1223 size = res + 1;
1224 else
1225 res = size;
1226 memcpy(w, unicode->str, size * sizeof(wchar_t));
1227 return res;
1228 }
1229 else
1230 return PyUnicode_GET_SIZE(unicode) + 1;
1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1232 register const Py_UNICODE *u;
1233 const Py_UNICODE *uend;
1234 const wchar_t *worig, *wend;
1235 Py_ssize_t nchar;
1236
Victor Stinner137c34c2010-09-29 10:25:54 +00001237 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001238 uend = u + PyUnicode_GET_SIZE(unicode);
1239 if (w != NULL) {
1240 worig = w;
1241 wend = w + size;
1242 while (u != uend && w != wend) {
1243 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1244 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1245 {
1246 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1247 u += 2;
1248 }
1249 else {
1250 *w = *u;
1251 u++;
1252 }
1253 w++;
1254 }
1255 if (w != wend)
1256 *w = L'\0';
1257 return w - worig;
1258 }
1259 else {
1260 nchar = 1; /* nul character at the end */
1261 while (u != uend) {
1262 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1263 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1264 u += 2;
1265 else
1266 u++;
1267 nchar++;
1268 }
1269 }
1270 return nchar;
1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1272 register Py_UNICODE *u, *uend, ordinal;
1273 register Py_ssize_t i;
1274 wchar_t *worig, *wend;
1275 Py_ssize_t nchar;
1276
1277 u = PyUnicode_AS_UNICODE(unicode);
1278 uend = u + PyUnicode_GET_SIZE(u);
1279 if (w != NULL) {
1280 worig = w;
1281 wend = w + size;
1282 while (u != uend && w != wend) {
1283 ordinal = *u;
1284 if (ordinal > 0xffff) {
1285 ordinal -= 0x10000;
1286 *w++ = 0xD800 | (ordinal >> 10);
1287 *w++ = 0xDC00 | (ordinal & 0x3FF);
1288 }
1289 else
1290 *w++ = ordinal;
1291 u++;
1292 }
1293 if (w != wend)
1294 *w = 0;
1295 return w - worig;
1296 }
1297 else {
1298 nchar = 1; /* nul character */
1299 while (u != uend) {
1300 if (*u > 0xffff)
1301 nchar += 2;
1302 else
1303 nchar++;
1304 u++;
1305 }
1306 return nchar;
1307 }
1308#else
1309# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001310#endif
1311}
1312
1313Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001315 wchar_t *w,
1316 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317{
1318 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001319 PyErr_BadInternalCall();
1320 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323}
1324
Victor Stinner137c34c2010-09-29 10:25:54 +00001325wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001326PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001327 Py_ssize_t *size)
1328{
1329 wchar_t* buffer;
1330 Py_ssize_t buflen;
1331
1332 if (unicode == NULL) {
1333 PyErr_BadInternalCall();
1334 return NULL;
1335 }
1336
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001338 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001339 PyErr_NoMemory();
1340 return NULL;
1341 }
1342
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1344 if (buffer == NULL) {
1345 PyErr_NoMemory();
1346 return NULL;
1347 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001348 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001349 if (size != NULL)
1350 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001351 return buffer;
1352}
1353
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354#endif
1355
Alexander Belopolsky40018472011-02-26 01:02:56 +00001356PyObject *
1357PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001358{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001359 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001361 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 PyErr_SetString(PyExc_ValueError,
1363 "chr() arg not in range(0x110000)");
1364 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001365 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001366
1367#ifndef Py_UNICODE_WIDE
1368 if (ordinal > 0xffff) {
1369 ordinal -= 0x10000;
1370 s[0] = 0xD800 | (ordinal >> 10);
1371 s[1] = 0xDC00 | (ordinal & 0x3FF);
1372 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001373 }
1374#endif
1375
Hye-Shik Chang40574832004-04-06 07:24:51 +00001376 s[0] = (Py_UNICODE)ordinal;
1377 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001378}
1379
Alexander Belopolsky40018472011-02-26 01:02:56 +00001380PyObject *
1381PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001383 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001384 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001385 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001386 Py_INCREF(obj);
1387 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001388 }
1389 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 /* For a Unicode subtype that's not a Unicode object,
1391 return a true Unicode object with the same data. */
1392 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1393 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001394 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001395 PyErr_Format(PyExc_TypeError,
1396 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001397 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001398 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001399}
1400
Alexander Belopolsky40018472011-02-26 01:02:56 +00001401PyObject *
1402PyUnicode_FromEncodedObject(register PyObject *obj,
1403 const char *encoding,
1404 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001405{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001406 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001407 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001408
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 PyErr_BadInternalCall();
1411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Decoding bytes objects is the most common case and should be fast */
1415 if (PyBytes_Check(obj)) {
1416 if (PyBytes_GET_SIZE(obj) == 0) {
1417 Py_INCREF(unicode_empty);
1418 v = (PyObject *) unicode_empty;
1419 }
1420 else {
1421 v = PyUnicode_Decode(
1422 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1423 encoding, errors);
1424 }
1425 return v;
1426 }
1427
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001428 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 PyErr_SetString(PyExc_TypeError,
1430 "decoding str is not supported");
1431 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001432 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001433
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001434 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1435 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1436 PyErr_Format(PyExc_TypeError,
1437 "coercing to str: need bytes, bytearray "
1438 "or buffer-like object, %.80s found",
1439 Py_TYPE(obj)->tp_name);
1440 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001441 }
Tim Petersced69f82003-09-16 20:30:58 +00001442
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001443 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001445 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 }
Tim Petersced69f82003-09-16 20:30:58 +00001447 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001448 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001449
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001450 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001451 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
/* Copy `encoding` into `lower`, converting upper case letters to lower
   case and replacing '_' with '-', in order to catch e.g. UTF_8.

   `lower` is a buffer of `lower_len` chars; on success it holds the
   nul-terminated normalized name.

   Return 0 on error (encoding is longer than lower_len-1; `lower` is then
   left without a terminating nul), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating nul */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        char ch;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            ch = Py_TOLOWER(*src);
        else if (*src == '_')
            ch = '-';
        else
            ch = *src;
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1486
Alexander Belopolsky40018472011-02-26 01:02:56 +00001487PyObject *
1488PyUnicode_Decode(const char *s,
1489 Py_ssize_t size,
1490 const char *encoding,
1491 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001492{
1493 PyObject *buffer = NULL, *unicode;
1494 Py_buffer info;
1495 char lower[11]; /* Enough for any encoding shortcut */
1496
1497 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001499
1500 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001501 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001502 if ((strcmp(lower, "utf-8") == 0) ||
1503 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001504 return PyUnicode_DecodeUTF8(s, size, errors);
1505 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001506 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001507 (strcmp(lower, "iso-8859-1") == 0))
1508 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001509#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001510 else if (strcmp(lower, "mbcs") == 0)
1511 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001513 else if (strcmp(lower, "ascii") == 0)
1514 return PyUnicode_DecodeASCII(s, size, errors);
1515 else if (strcmp(lower, "utf-16") == 0)
1516 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1517 else if (strcmp(lower, "utf-32") == 0)
1518 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520
1521 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001522 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001523 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001524 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001525 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 if (buffer == NULL)
1527 goto onError;
1528 unicode = PyCodec_Decode(buffer, encoding, errors);
1529 if (unicode == NULL)
1530 goto onError;
1531 if (!PyUnicode_Check(unicode)) {
1532 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001533 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001534 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535 Py_DECREF(unicode);
1536 goto onError;
1537 }
1538 Py_DECREF(buffer);
1539 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001540
Benjamin Peterson29060642009-01-31 22:14:21 +00001541 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 Py_XDECREF(buffer);
1543 return NULL;
1544}
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546PyObject *
1547PyUnicode_AsDecodedObject(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 return v;
1566
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001568 return NULL;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571PyObject *
1572PyUnicode_AsDecodedUnicode(PyObject *unicode,
1573 const char *encoding,
1574 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575{
1576 PyObject *v;
1577
1578 if (!PyUnicode_Check(unicode)) {
1579 PyErr_BadArgument();
1580 goto onError;
1581 }
1582
1583 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001585
1586 /* Decode via the codec registry */
1587 v = PyCodec_Decode(unicode, encoding, errors);
1588 if (v == NULL)
1589 goto onError;
1590 if (!PyUnicode_Check(v)) {
1591 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001592 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001593 Py_TYPE(v)->tp_name);
1594 Py_DECREF(v);
1595 goto onError;
1596 }
1597 return v;
1598
Benjamin Peterson29060642009-01-31 22:14:21 +00001599 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001600 return NULL;
1601}
1602
Alexander Belopolsky40018472011-02-26 01:02:56 +00001603PyObject *
1604PyUnicode_Encode(const Py_UNICODE *s,
1605 Py_ssize_t size,
1606 const char *encoding,
1607 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608{
1609 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001610
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 unicode = PyUnicode_FromUnicode(s, size);
1612 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1615 Py_DECREF(unicode);
1616 return v;
1617}
1618
Alexander Belopolsky40018472011-02-26 01:02:56 +00001619PyObject *
1620PyUnicode_AsEncodedObject(PyObject *unicode,
1621 const char *encoding,
1622 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001623{
1624 PyObject *v;
1625
1626 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument();
1628 goto onError;
1629 }
1630
1631 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001633
1634 /* Encode via the codec registry */
1635 v = PyCodec_Encode(unicode, encoding, errors);
1636 if (v == NULL)
1637 goto onError;
1638 return v;
1639
Benjamin Peterson29060642009-01-31 22:14:21 +00001640 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001641 return NULL;
1642}
1643
Victor Stinnerad158722010-10-27 00:25:46 +00001644PyObject *
1645PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001646{
Victor Stinner313a1202010-06-11 23:56:51 +00001647#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode),
1650 NULL);
1651#elif defined(__APPLE__)
1652 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 "surrogateescape");
1655#else
1656 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001657 return PyUnicode_AsEncodedString(unicode,
1658 Py_FileSystemDefaultEncoding,
1659 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001660 }
1661 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001662 /* locale encoding with surrogateescape */
1663 wchar_t *wchar;
1664 char *bytes;
1665 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001666 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001667
1668 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1669 if (wchar == NULL)
1670 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001671 bytes = _Py_wchar2char(wchar, &error_pos);
1672 if (bytes == NULL) {
1673 if (error_pos != (size_t)-1) {
1674 char *errmsg = strerror(errno);
1675 PyObject *exc = NULL;
1676 if (errmsg == NULL)
1677 errmsg = "Py_wchar2char() failed";
1678 raise_encode_exception(&exc,
1679 "filesystemencoding",
1680 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1681 error_pos, error_pos+1,
1682 errmsg);
1683 Py_XDECREF(exc);
1684 }
1685 else
1686 PyErr_NoMemory();
1687 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001688 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001689 }
1690 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001691
1692 bytes_obj = PyBytes_FromString(bytes);
1693 PyMem_Free(bytes);
1694 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001695 }
Victor Stinnerad158722010-10-27 00:25:46 +00001696#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001697}
1698
Alexander Belopolsky40018472011-02-26 01:02:56 +00001699PyObject *
1700PyUnicode_AsEncodedString(PyObject *unicode,
1701 const char *encoding,
1702 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703{
1704 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001705 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001706
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 if (!PyUnicode_Check(unicode)) {
1708 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 }
Fred Drakee4315f52000-05-09 19:53:39 +00001711
Victor Stinner2f283c22011-03-02 01:21:46 +00001712 if (encoding == NULL) {
1713 if (errors == NULL || strcmp(errors, "strict") == 0)
1714 return PyUnicode_AsUTF8String(unicode);
1715 else
1716 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1717 PyUnicode_GET_SIZE(unicode),
1718 errors);
1719 }
Fred Drakee4315f52000-05-09 19:53:39 +00001720
1721 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001722 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001723 if ((strcmp(lower, "utf-8") == 0) ||
1724 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00001725 {
Victor Stinner2f283c22011-03-02 01:21:46 +00001726 if (errors == NULL || strcmp(errors, "strict") == 0)
Victor Stinnera5c68c32011-03-02 01:03:14 +00001727 return PyUnicode_AsUTF8String(unicode);
Victor Stinner2f283c22011-03-02 01:21:46 +00001728 else
Victor Stinnera5c68c32011-03-02 01:03:14 +00001729 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1730 PyUnicode_GET_SIZE(unicode),
1731 errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00001732 }
Victor Stinner37296e82010-06-10 13:36:23 +00001733 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001734 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001735 (strcmp(lower, "iso-8859-1") == 0))
1736 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1737 PyUnicode_GET_SIZE(unicode),
1738 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001739#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001740 else if (strcmp(lower, "mbcs") == 0)
1741 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1742 PyUnicode_GET_SIZE(unicode),
1743 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001744#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001745 else if (strcmp(lower, "ascii") == 0)
1746 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1747 PyUnicode_GET_SIZE(unicode),
1748 errors);
1749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750
1751 /* Encode via the codec registry */
1752 v = PyCodec_Encode(unicode, encoding, errors);
1753 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001754 return NULL;
1755
1756 /* The normal path */
1757 if (PyBytes_Check(v))
1758 return v;
1759
1760 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001761 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001762 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001763 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001764
1765 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1766 "encoder %s returned bytearray instead of bytes",
1767 encoding);
1768 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001769 Py_DECREF(v);
1770 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001771 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001772
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001773 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1774 Py_DECREF(v);
1775 return b;
1776 }
1777
1778 PyErr_Format(PyExc_TypeError,
1779 "encoder did not return a bytes object (type=%.400s)",
1780 Py_TYPE(v)->tp_name);
1781 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001782 return NULL;
1783}
1784
Alexander Belopolsky40018472011-02-26 01:02:56 +00001785PyObject *
1786PyUnicode_AsEncodedUnicode(PyObject *unicode,
1787 const char *encoding,
1788 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001789{
1790 PyObject *v;
1791
1792 if (!PyUnicode_Check(unicode)) {
1793 PyErr_BadArgument();
1794 goto onError;
1795 }
1796
1797 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001799
1800 /* Encode via the codec registry */
1801 v = PyCodec_Encode(unicode, encoding, errors);
1802 if (v == NULL)
1803 goto onError;
1804 if (!PyUnicode_Check(v)) {
1805 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001806 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001807 Py_TYPE(v)->tp_name);
1808 Py_DECREF(v);
1809 goto onError;
1810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001812
Benjamin Peterson29060642009-01-31 22:14:21 +00001813 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 return NULL;
1815}
1816
Alexander Belopolsky40018472011-02-26 01:02:56 +00001817PyObject *
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001818_PyUnicode_AsDefaultEncodedString(PyObject *unicode)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001819{
1820 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001821 if (v)
1822 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001823 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001824 PyUnicode_GET_SIZE(unicode),
1825 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001826 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001827 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001828 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001829 return v;
1830}
1831
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001832PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001833PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001834 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001835 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1836}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001837
Christian Heimes5894ba72007-11-04 11:43:14 +00001838PyObject*
1839PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1840{
Victor Stinnerad158722010-10-27 00:25:46 +00001841#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1842 return PyUnicode_DecodeMBCS(s, size, NULL);
1843#elif defined(__APPLE__)
1844 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1845#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001846 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1847 can be undefined. If it is case, decode using UTF-8. The following assumes
1848 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1849 bootstrapping process where the codecs aren't ready yet.
1850 */
1851 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001852 return PyUnicode_Decode(s, size,
1853 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001854 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001855 }
1856 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001857 /* locale encoding with surrogateescape */
1858 wchar_t *wchar;
1859 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001860 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001861
1862 if (s[size] != '\0' || size != strlen(s)) {
1863 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1864 return NULL;
1865 }
1866
Victor Stinner168e1172010-10-16 23:16:16 +00001867 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001868 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001869 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001870
Victor Stinner168e1172010-10-16 23:16:16 +00001871 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001872 PyMem_Free(wchar);
1873 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001874 }
Victor Stinnerad158722010-10-27 00:25:46 +00001875#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001876}
1877
Martin v. Löwis011e8422009-05-05 04:43:17 +00001878
1879int
1880PyUnicode_FSConverter(PyObject* arg, void* addr)
1881{
1882 PyObject *output = NULL;
1883 Py_ssize_t size;
1884 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001885 if (arg == NULL) {
1886 Py_DECREF(*(PyObject**)addr);
1887 return 1;
1888 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001889 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001890 output = arg;
1891 Py_INCREF(output);
1892 }
1893 else {
1894 arg = PyUnicode_FromObject(arg);
1895 if (!arg)
1896 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001897 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001898 Py_DECREF(arg);
1899 if (!output)
1900 return 0;
1901 if (!PyBytes_Check(output)) {
1902 Py_DECREF(output);
1903 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1904 return 0;
1905 }
1906 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001907 size = PyBytes_GET_SIZE(output);
1908 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001909 if (size != strlen(data)) {
1910 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1911 Py_DECREF(output);
1912 return 0;
1913 }
1914 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001915 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001916}
1917
1918
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001919int
1920PyUnicode_FSDecoder(PyObject* arg, void* addr)
1921{
1922 PyObject *output = NULL;
1923 Py_ssize_t size;
1924 void *data;
1925 if (arg == NULL) {
1926 Py_DECREF(*(PyObject**)addr);
1927 return 1;
1928 }
1929 if (PyUnicode_Check(arg)) {
1930 output = arg;
1931 Py_INCREF(output);
1932 }
1933 else {
1934 arg = PyBytes_FromObject(arg);
1935 if (!arg)
1936 return 0;
1937 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1938 PyBytes_GET_SIZE(arg));
1939 Py_DECREF(arg);
1940 if (!output)
1941 return 0;
1942 if (!PyUnicode_Check(output)) {
1943 Py_DECREF(output);
1944 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1945 return 0;
1946 }
1947 }
1948 size = PyUnicode_GET_SIZE(output);
1949 data = PyUnicode_AS_UNICODE(output);
1950 if (size != Py_UNICODE_strlen(data)) {
1951 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1952 Py_DECREF(output);
1953 return 0;
1954 }
1955 *(PyObject**)addr = output;
1956 return Py_CLEANUP_SUPPORTED;
1957}
1958
1959
Martin v. Löwis5b222132007-06-10 09:51:05 +00001960char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001961_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001962{
Christian Heimesf3863112007-11-22 07:46:41 +00001963 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001964 if (!PyUnicode_Check(unicode)) {
1965 PyErr_BadArgument();
1966 return NULL;
1967 }
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001968 bytes = _PyUnicode_AsDefaultEncodedString(unicode);
Christian Heimesf3863112007-11-22 07:46:41 +00001969 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001970 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001971 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001972 *psize = PyBytes_GET_SIZE(bytes);
1973 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001974}
1975
1976char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001977_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001978{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001979 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001980}
1981
Alexander Belopolsky40018472011-02-26 01:02:56 +00001982Py_UNICODE *
1983PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984{
1985 if (!PyUnicode_Check(unicode)) {
1986 PyErr_BadArgument();
1987 goto onError;
1988 }
1989 return PyUnicode_AS_UNICODE(unicode);
1990
Benjamin Peterson29060642009-01-31 22:14:21 +00001991 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return NULL;
1993}
1994
Alexander Belopolsky40018472011-02-26 01:02:56 +00001995Py_ssize_t
1996PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997{
1998 if (!PyUnicode_Check(unicode)) {
1999 PyErr_BadArgument();
2000 goto onError;
2001 }
2002 return PyUnicode_GET_SIZE(unicode);
2003
Benjamin Peterson29060642009-01-31 22:14:21 +00002004 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 return -1;
2006}
2007
/* Return the name of the default encoding.  The value is a fixed,
   statically allocated string ("utf-8"); callers must not modify or
   free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2013
Victor Stinner554f3f02010-06-16 23:33:54 +00002014/* create or adjust a UnicodeDecodeError */
2015static void
2016make_decode_exception(PyObject **exceptionObject,
2017 const char *encoding,
2018 const char *input, Py_ssize_t length,
2019 Py_ssize_t startpos, Py_ssize_t endpos,
2020 const char *reason)
2021{
2022 if (*exceptionObject == NULL) {
2023 *exceptionObject = PyUnicodeDecodeError_Create(
2024 encoding, input, length, startpos, endpos, reason);
2025 }
2026 else {
2027 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2028 goto onError;
2029 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2030 goto onError;
2031 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2032 goto onError;
2033 }
2034 return;
2035
2036onError:
2037 Py_DECREF(*exceptionObject);
2038 *exceptionObject = NULL;
2039}
2040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041/* error handling callback helper:
2042 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002043 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 and adjust various state variables.
2045 return 0 on success, -1 on error
2046*/
2047
Alexander Belopolsky40018472011-02-26 01:02:56 +00002048static int
2049unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2050 const char *encoding, const char *reason,
2051 const char **input, const char **inend, Py_ssize_t *startinpos,
2052 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2053 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002055 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056
2057 PyObject *restuple = NULL;
2058 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002059 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002060 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002061 Py_ssize_t requiredsize;
2062 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002064 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002065 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 int res = -1;
2067
2068 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002069 *errorHandler = PyCodec_LookupError(errors);
2070 if (*errorHandler == NULL)
2071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 }
2073
Victor Stinner554f3f02010-06-16 23:33:54 +00002074 make_decode_exception(exceptionObject,
2075 encoding,
2076 *input, *inend - *input,
2077 *startinpos, *endinpos,
2078 reason);
2079 if (*exceptionObject == NULL)
2080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081
2082 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2083 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002086 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002087 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002088 }
2089 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002090 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002091
2092 /* Copy back the bytes variables, which might have been modified by the
2093 callback */
2094 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2095 if (!inputobj)
2096 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002097 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002098 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002099 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002100 *input = PyBytes_AS_STRING(inputobj);
2101 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002102 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002103 /* we can DECREF safely, as the exception has another reference,
2104 so the object won't go away. */
2105 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002108 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002109 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002110 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2111 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002112 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113
2114 /* need more space? (at least enough for what we
2115 have+the replacement+the rest of the string (starting
2116 at the new input position), so we won't have to check space
2117 when there are no errors in the rest of the string) */
2118 repptr = PyUnicode_AS_UNICODE(repunicode);
2119 repsize = PyUnicode_GET_SIZE(repunicode);
2120 requiredsize = *outpos + repsize + insize-newpos;
2121 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002122 if (requiredsize<2*outsize)
2123 requiredsize = 2*outsize;
2124 if (_PyUnicode_Resize(output, requiredsize) < 0)
2125 goto onError;
2126 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 }
2128 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002129 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 Py_UNICODE_COPY(*outptr, repptr, repsize);
2131 *outptr += repsize;
2132 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 /* we made it! */
2135 res = 0;
2136
Benjamin Peterson29060642009-01-31 22:14:21 +00002137 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 Py_XDECREF(restuple);
2139 return res;
2140}
2141
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002142/* --- UTF-7 Codec -------------------------------------------------------- */
2143
Antoine Pitrou244651a2009-05-04 18:56:13 +00002144/* See RFC2152 for details. We encode conservatively and decode liberally. */
2145
/* Three simple macros defining base-64.  All of the macros below may
   evaluate their argument more than once, so only pass expressions
   without side effects. */

/* Is c a base-64 character, i.e. one of A-Z, a-z, 0-9, '+' or '/'? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* Given that c is a base-64 character, what is its base-64 value?
   (A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62; anything else,
   in particular '/', yields 63.) */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 :                                \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
2176
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; values outside 1..127 are never direct
 * directO  -- true if Set O characters may be written as themselves
 * directWS -- true if whitespace may be written as itself
 *
 * The range test comes first so that utf7_category is only indexed with
 * values inside the table.  directO and directWS are parenthesized in
 * the expansion so that compound arguments such as `a || b` keep their
 * meaning.  c is evaluated several times; pass no side effects. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002222
Alexander Belopolsky40018472011-02-26 01:02:56 +00002223PyObject *
2224PyUnicode_DecodeUTF7(const char *s,
2225 Py_ssize_t size,
2226 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002227{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002228 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2229}
2230
Antoine Pitrou244651a2009-05-04 18:56:13 +00002231/* The decoder. The only state we preserve is our read position,
2232 * i.e. how many characters we have consumed. So if we end in the
2233 * middle of a shift sequence we have to back off the read position
2234 * and the output to the beginning of the sequence, otherwise we lose
2235 * all the shift state (seen bits, number of bits seen, high
2236 * surrogate). */
2237
Alexander Belopolsky40018472011-02-26 01:02:56 +00002238PyObject *
2239PyUnicode_DecodeUTF7Stateful(const char *s,
2240 Py_ssize_t size,
2241 const char *errors,
2242 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002243{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002245 Py_ssize_t startinpos;
2246 Py_ssize_t endinpos;
2247 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002248 const char *e;
2249 PyUnicodeObject *unicode;
2250 Py_UNICODE *p;
2251 const char *errmsg = "";
2252 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002253 Py_UNICODE *shiftOutStart;
2254 unsigned int base64bits = 0;
2255 unsigned long base64buffer = 0;
2256 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 PyObject *errorHandler = NULL;
2258 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002259
2260 unicode = _PyUnicode_New(size);
2261 if (!unicode)
2262 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002263 if (size == 0) {
2264 if (consumed)
2265 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002266 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002267 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002268
2269 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002270 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002271 e = s + size;
2272
2273 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002274 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002275 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002276 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002277
Antoine Pitrou244651a2009-05-04 18:56:13 +00002278 if (inShift) { /* in a base-64 section */
2279 if (IS_BASE64(ch)) { /* consume a base-64 character */
2280 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2281 base64bits += 6;
2282 s++;
2283 if (base64bits >= 16) {
2284 /* we have enough bits for a UTF-16 value */
2285 Py_UNICODE outCh = (Py_UNICODE)
2286 (base64buffer >> (base64bits-16));
2287 base64bits -= 16;
2288 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2289 if (surrogate) {
2290 /* expecting a second surrogate */
2291 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2292#ifdef Py_UNICODE_WIDE
2293 *p++ = (((surrogate & 0x3FF)<<10)
2294 | (outCh & 0x3FF)) + 0x10000;
2295#else
2296 *p++ = surrogate;
2297 *p++ = outCh;
2298#endif
2299 surrogate = 0;
2300 }
2301 else {
2302 surrogate = 0;
2303 errmsg = "second surrogate missing";
2304 goto utf7Error;
2305 }
2306 }
2307 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2308 /* first surrogate */
2309 surrogate = outCh;
2310 }
2311 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2312 errmsg = "unexpected second surrogate";
2313 goto utf7Error;
2314 }
2315 else {
2316 *p++ = outCh;
2317 }
2318 }
2319 }
2320 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002321 inShift = 0;
2322 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002323 if (surrogate) {
2324 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002325 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002326 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002327 if (base64bits > 0) { /* left-over bits */
2328 if (base64bits >= 6) {
2329 /* We've seen at least one base-64 character */
2330 errmsg = "partial character in shift sequence";
2331 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002332 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002333 else {
2334 /* Some bits remain; they should be zero */
2335 if (base64buffer != 0) {
2336 errmsg = "non-zero padding bits in shift sequence";
2337 goto utf7Error;
2338 }
2339 }
2340 }
2341 if (ch != '-') {
2342 /* '-' is absorbed; other terminating
2343 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002344 *p++ = ch;
2345 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002346 }
2347 }
2348 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002349 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002350 s++; /* consume '+' */
2351 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002352 s++;
2353 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002354 }
2355 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002356 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002357 shiftOutStart = p;
2358 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002359 }
2360 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002361 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002362 *p++ = ch;
2363 s++;
2364 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002365 else {
2366 startinpos = s-starts;
2367 s++;
2368 errmsg = "unexpected special character";
2369 goto utf7Error;
2370 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002371 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002372utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002373 outpos = p-PyUnicode_AS_UNICODE(unicode);
2374 endinpos = s-starts;
2375 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 errors, &errorHandler,
2377 "utf7", errmsg,
2378 &starts, &e, &startinpos, &endinpos, &exc, &s,
2379 &unicode, &outpos, &p))
2380 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002381 }
2382
Antoine Pitrou244651a2009-05-04 18:56:13 +00002383 /* end of string */
2384
2385 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2386 /* if we're in an inconsistent state, that's an error */
2387 if (surrogate ||
2388 (base64bits >= 6) ||
2389 (base64bits > 0 && base64buffer != 0)) {
2390 outpos = p-PyUnicode_AS_UNICODE(unicode);
2391 endinpos = size;
2392 if (unicode_decode_call_errorhandler(
2393 errors, &errorHandler,
2394 "utf7", "unterminated shift sequence",
2395 &starts, &e, &startinpos, &endinpos, &exc, &s,
2396 &unicode, &outpos, &p))
2397 goto onError;
2398 if (s < e)
2399 goto restart;
2400 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002401 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002402
2403 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002404 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002405 if (inShift) {
2406 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002407 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002408 }
2409 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002410 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002411 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002412 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002413
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002414 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002415 goto onError;
2416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002417 Py_XDECREF(errorHandler);
2418 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002419 return (PyObject *)unicode;
2420
Benjamin Peterson29060642009-01-31 22:14:21 +00002421 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002422 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002424 Py_DECREF(unicode);
2425 return NULL;
2426}
2427
2428
Alexander Belopolsky40018472011-02-26 01:02:56 +00002429PyObject *
2430PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2431 Py_ssize_t size,
2432 int base64SetO,
2433 int base64WhiteSpace,
2434 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002435{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002436 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002437 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002438 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002439 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002440 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002441 unsigned int base64bits = 0;
2442 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002443 char * out;
2444 char * start;
2445
2446 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002447 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002448
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002449 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002450 return PyErr_NoMemory();
2451
Antoine Pitrou244651a2009-05-04 18:56:13 +00002452 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002453 if (v == NULL)
2454 return NULL;
2455
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002456 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002457 for (;i < size; ++i) {
2458 Py_UNICODE ch = s[i];
2459
Antoine Pitrou244651a2009-05-04 18:56:13 +00002460 if (inShift) {
2461 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2462 /* shifting out */
2463 if (base64bits) { /* output remaining bits */
2464 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2465 base64buffer = 0;
2466 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002467 }
2468 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002469 /* Characters not in the BASE64 set implicitly unshift the sequence
2470 so no '-' is required, except if the character is itself a '-' */
2471 if (IS_BASE64(ch) || ch == '-') {
2472 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002473 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002474 *out++ = (char) ch;
2475 }
2476 else {
2477 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002478 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002479 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002480 else { /* not in a shift sequence */
2481 if (ch == '+') {
2482 *out++ = '+';
2483 *out++ = '-';
2484 }
2485 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2486 *out++ = (char) ch;
2487 }
2488 else {
2489 *out++ = '+';
2490 inShift = 1;
2491 goto encode_char;
2492 }
2493 }
2494 continue;
2495encode_char:
2496#ifdef Py_UNICODE_WIDE
2497 if (ch >= 0x10000) {
2498 /* code first surrogate */
2499 base64bits += 16;
2500 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2501 while (base64bits >= 6) {
2502 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2503 base64bits -= 6;
2504 }
2505 /* prepare second surrogate */
2506 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2507 }
2508#endif
2509 base64bits += 16;
2510 base64buffer = (base64buffer << 16) | ch;
2511 while (base64bits >= 6) {
2512 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2513 base64bits -= 6;
2514 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002515 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002516 if (base64bits)
2517 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2518 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002519 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002520 if (_PyBytes_Resize(&v, out - start) < 0)
2521 return NULL;
2522 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002523}
2524
Antoine Pitrou244651a2009-05-04 18:56:13 +00002525#undef IS_BASE64
2526#undef FROM_BASE64
2527#undef TO_BASE64
2528#undef DECODE_DIRECT
2529#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002530
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531/* --- UTF-8 Codec -------------------------------------------------------- */
2532
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   Zero means the byte is not a legal lead byte.  See RFC 3629 for details.
   The table is only ever read, hence `const`. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2554
Alexander Belopolsky40018472011-02-26 01:02:56 +00002555PyObject *
2556PyUnicode_DecodeUTF8(const char *s,
2557 Py_ssize_t size,
2558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559{
Walter Dörwald69652032004-09-07 20:24:22 +00002560 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2561}
2562
Antoine Pitrouab868312009-01-10 15:40:25 +00002563/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2564#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2565
2566/* Mask to quickly check whether a C 'long' contains a
2567 non-ASCII, UTF8-encoded char. */
2568#if (SIZEOF_LONG == 8)
2569# define ASCII_CHAR_MASK 0x8080808080808080L
2570#elif (SIZEOF_LONG == 4)
2571# define ASCII_CHAR_MASK 0x80808080L
2572#else
2573# error C 'long' size should be either 4 or 8!
2574#endif
2575
Alexander Belopolsky40018472011-02-26 01:02:56 +00002576PyObject *
2577PyUnicode_DecodeUTF8Stateful(const char *s,
2578 Py_ssize_t size,
2579 const char *errors,
2580 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002581{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002582 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002584 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002585 Py_ssize_t startinpos;
2586 Py_ssize_t endinpos;
2587 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002588 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 PyUnicodeObject *unicode;
2590 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002591 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002592 PyObject *errorHandler = NULL;
2593 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594
2595 /* Note: size will always be longer than the resulting Unicode
2596 character count */
2597 unicode = _PyUnicode_New(size);
2598 if (!unicode)
2599 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002600 if (size == 0) {
2601 if (consumed)
2602 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
2606 /* Unpack UTF-8 encoded data */
2607 p = unicode->str;
2608 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002609 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002612 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613
2614 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002615 /* Fast path for runs of ASCII characters. Given that common UTF-8
2616 input will consist of an overwhelming majority of ASCII
2617 characters, we try to optimize for this case by checking
2618 as many characters as a C 'long' can contain.
2619 First, check if we can do an aligned read, as most CPUs have
2620 a penalty for unaligned reads.
2621 */
2622 if (!((size_t) s & LONG_PTR_MASK)) {
2623 /* Help register allocation */
2624 register const char *_s = s;
2625 register Py_UNICODE *_p = p;
2626 while (_s < aligned_end) {
2627 /* Read a whole long at a time (either 4 or 8 bytes),
2628 and do a fast unrolled copy if it only contains ASCII
2629 characters. */
2630 unsigned long data = *(unsigned long *) _s;
2631 if (data & ASCII_CHAR_MASK)
2632 break;
2633 _p[0] = (unsigned char) _s[0];
2634 _p[1] = (unsigned char) _s[1];
2635 _p[2] = (unsigned char) _s[2];
2636 _p[3] = (unsigned char) _s[3];
2637#if (SIZEOF_LONG == 8)
2638 _p[4] = (unsigned char) _s[4];
2639 _p[5] = (unsigned char) _s[5];
2640 _p[6] = (unsigned char) _s[6];
2641 _p[7] = (unsigned char) _s[7];
2642#endif
2643 _s += SIZEOF_LONG;
2644 _p += SIZEOF_LONG;
2645 }
2646 s = _s;
2647 p = _p;
2648 if (s == e)
2649 break;
2650 ch = (unsigned char)*s;
2651 }
2652 }
2653
2654 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002655 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656 s++;
2657 continue;
2658 }
2659
2660 n = utf8_code_length[ch];
2661
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002662 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002663 if (consumed)
2664 break;
2665 else {
2666 errmsg = "unexpected end of data";
2667 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002668 endinpos = startinpos+1;
2669 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2670 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 goto utf8Error;
2672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674
2675 switch (n) {
2676
2677 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002678 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002679 startinpos = s-starts;
2680 endinpos = startinpos+1;
2681 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682
2683 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002684 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 startinpos = s-starts;
2686 endinpos = startinpos+1;
2687 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688
2689 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002690 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002691 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002693 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002694 goto utf8Error;
2695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002697 assert ((ch > 0x007F) && (ch <= 0x07FF));
2698 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 break;
2700
2701 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002702 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2703 will result in surrogates in range d800-dfff. Surrogates are
2704 not valid UTF-8 so they are rejected.
2705 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2706 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002707 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002708 (s[2] & 0xc0) != 0x80 ||
2709 ((unsigned char)s[0] == 0xE0 &&
2710 (unsigned char)s[1] < 0xA0) ||
2711 ((unsigned char)s[0] == 0xED &&
2712 (unsigned char)s[1] > 0x9F)) {
2713 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002714 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002715 endinpos = startinpos + 1;
2716
2717 /* if s[1] first two bits are 1 and 0, then the invalid
2718 continuation byte is s[2], so increment endinpos by 1,
2719 if not, s[1] is invalid and endinpos doesn't need to
2720 be incremented. */
2721 if ((s[1] & 0xC0) == 0x80)
2722 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002723 goto utf8Error;
2724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002726 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2727 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002728 break;
2729
2730 case 4:
2731 if ((s[1] & 0xc0) != 0x80 ||
2732 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002733 (s[3] & 0xc0) != 0x80 ||
2734 ((unsigned char)s[0] == 0xF0 &&
2735 (unsigned char)s[1] < 0x90) ||
2736 ((unsigned char)s[0] == 0xF4 &&
2737 (unsigned char)s[1] > 0x8F)) {
2738 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002739 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002740 endinpos = startinpos + 1;
2741 if ((s[1] & 0xC0) == 0x80) {
2742 endinpos++;
2743 if ((s[2] & 0xC0) == 0x80)
2744 endinpos++;
2745 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002746 goto utf8Error;
2747 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002748 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002749 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2750 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2751
Fredrik Lundh8f455852001-06-27 18:59:43 +00002752#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002753 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002754#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002755 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002756
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002757 /* translate from 10000..10FFFF to 0..FFFF */
2758 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002759
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002760 /* high surrogate = top 10 bits added to D800 */
2761 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002762
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002763 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002764 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002765#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 }
2768 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002769 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002770
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 utf8Error:
2772 outpos = p-PyUnicode_AS_UNICODE(unicode);
2773 if (unicode_decode_call_errorhandler(
2774 errors, &errorHandler,
2775 "utf8", errmsg,
2776 &starts, &e, &startinpos, &endinpos, &exc, &s,
2777 &unicode, &outpos, &p))
2778 goto onError;
2779 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 }
Walter Dörwald69652032004-09-07 20:24:22 +00002781 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002782 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783
2784 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002785 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 goto onError;
2787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788 Py_XDECREF(errorHandler);
2789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 return (PyObject *)unicode;
2791
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 Py_XDECREF(errorHandler);
2794 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 Py_DECREF(unicode);
2796 return NULL;
2797}
2798
Antoine Pitrouab868312009-01-10 15:40:25 +00002799#undef ASCII_CHAR_MASK
2800
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly PyMem_Malloc()ed,
   NUL-terminated wchar_t string.  Every byte that is not part of a valid
   UTF-8 sequence is stored as the single code unit 0xDC00 + byte and
   decoding resumes at the following byte.

   Returns NULL if the buffer cannot be allocated; the size-overflow path
   additionally sets MemoryError.  The caller owns the returned buffer
   (presumably to be released with PyMem_Free -- confirm against callers). */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size is always at least as large as the resulting character
       count, so size + 1 elements always suffice.  The test is written so
       that `size + 1` is never computed on an unchecked value; a negative
       size converts to a huge size_t and is rejected as well. */
    if ((size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* compare lengths, not pointers: `s + n` may lie more than one
           element past the end of the buffer */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the output buffer holds wchar_t, not Py_UNICODE */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the offending lead byte here: it is only
           overwritten after a sequence has been fully validated */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002915
Tim Peters602f7402002-04-27 18:03:26 +00002916/* Allocation strategy: if the string is short, convert into a stack buffer
2917 and allocate exactly as much space needed at the end. Else allocate the
2918 maximum possible needed (4 result bytes per Unicode character), and return
2919 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002920*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002921PyObject *
2922PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 Py_ssize_t size,
2924 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925{
Tim Peters602f7402002-04-27 18:03:26 +00002926#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002927
Guido van Rossum98297ee2007-11-06 21:34:58 +00002928 Py_ssize_t i; /* index into s of next input byte */
2929 PyObject *result; /* result string object */
2930 char *p; /* next free byte in output buffer */
2931 Py_ssize_t nallocated; /* number of result bytes allocated */
2932 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002933 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002934 PyObject *errorHandler = NULL;
2935 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002936
Tim Peters602f7402002-04-27 18:03:26 +00002937 assert(s != NULL);
2938 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939
Tim Peters602f7402002-04-27 18:03:26 +00002940 if (size <= MAX_SHORT_UNICHARS) {
2941 /* Write into the stack buffer; nallocated can't overflow.
2942 * At the end, we'll allocate exactly as much heap space as it
2943 * turns out we need.
2944 */
2945 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002946 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002947 p = stackbuf;
2948 }
2949 else {
2950 /* Overallocate on the heap, and give the excess back at the end. */
2951 nallocated = size * 4;
2952 if (nallocated / 4 != size) /* overflow! */
2953 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002954 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002955 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002956 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002957 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002958 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002959
Tim Peters602f7402002-04-27 18:03:26 +00002960 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002961 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002962
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002963 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002964 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002966
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002968 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002969 *p++ = (char)(0xc0 | (ch >> 6));
2970 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002971 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002972#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002973 /* Special case: check for high and low surrogate */
2974 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2975 Py_UCS4 ch2 = s[i];
2976 /* Combine the two surrogates to form a UCS4 value */
2977 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2978 i++;
2979
2980 /* Encode UCS4 Unicode ordinals */
2981 *p++ = (char)(0xf0 | (ch >> 18));
2982 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002983 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2984 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002985 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002986#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002987 Py_ssize_t newpos;
2988 PyObject *rep;
2989 Py_ssize_t repsize, k;
2990 rep = unicode_encode_call_errorhandler
2991 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2992 s, size, &exc, i-1, i, &newpos);
2993 if (!rep)
2994 goto error;
2995
2996 if (PyBytes_Check(rep))
2997 repsize = PyBytes_GET_SIZE(rep);
2998 else
2999 repsize = PyUnicode_GET_SIZE(rep);
3000
3001 if (repsize > 4) {
3002 Py_ssize_t offset;
3003
3004 if (result == NULL)
3005 offset = p - stackbuf;
3006 else
3007 offset = p - PyBytes_AS_STRING(result);
3008
3009 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3010 /* integer overflow */
3011 PyErr_NoMemory();
3012 goto error;
3013 }
3014 nallocated += repsize - 4;
3015 if (result != NULL) {
3016 if (_PyBytes_Resize(&result, nallocated) < 0)
3017 goto error;
3018 } else {
3019 result = PyBytes_FromStringAndSize(NULL, nallocated);
3020 if (result == NULL)
3021 goto error;
3022 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3023 }
3024 p = PyBytes_AS_STRING(result) + offset;
3025 }
3026
3027 if (PyBytes_Check(rep)) {
3028 char *prep = PyBytes_AS_STRING(rep);
3029 for(k = repsize; k > 0; k--)
3030 *p++ = *prep++;
3031 } else /* rep is unicode */ {
3032 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3033 Py_UNICODE c;
3034
3035 for(k=0; k<repsize; k++) {
3036 c = prep[k];
3037 if (0x80 <= c) {
3038 raise_encode_exception(&exc, "utf-8", s, size,
3039 i-1, i, "surrogates not allowed");
3040 goto error;
3041 }
3042 *p++ = (char)prep[k];
3043 }
3044 }
3045 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003046#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003047 }
Victor Stinner445a6232010-04-22 20:01:57 +00003048#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003049 } else if (ch < 0x10000) {
3050 *p++ = (char)(0xe0 | (ch >> 12));
3051 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3052 *p++ = (char)(0x80 | (ch & 0x3f));
3053 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003054 /* Encode UCS4 Unicode ordinals */
3055 *p++ = (char)(0xf0 | (ch >> 18));
3056 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3057 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3058 *p++ = (char)(0x80 | (ch & 0x3f));
3059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003061
Guido van Rossum98297ee2007-11-06 21:34:58 +00003062 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003063 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003064 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003065 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003066 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003067 }
3068 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003069 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003070 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003071 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003072 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003073 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003074 Py_XDECREF(errorHandler);
3075 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003076 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003077 error:
3078 Py_XDECREF(errorHandler);
3079 Py_XDECREF(exc);
3080 Py_XDECREF(result);
3081 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003082
Tim Peters602f7402002-04-27 18:03:26 +00003083#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084}
3085
Alexander Belopolsky40018472011-02-26 01:02:56 +00003086PyObject *
3087PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088{
Victor Stinnera5c68c32011-03-02 01:03:14 +00003089 PyObject *utf8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument();
3092 return NULL;
3093 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003094 utf8 = _PyUnicode_AsDefaultEncodedString(unicode);
3095 if (utf8 == NULL)
3096 return NULL;
3097 Py_INCREF(utf8);
3098 return utf8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099}
3100
Walter Dörwald41980ca2007-08-16 21:55:45 +00003101/* --- UTF-32 Codec ------------------------------------------------------- */
3102
3103PyObject *
3104PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 Py_ssize_t size,
3106 const char *errors,
3107 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003108{
3109 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3110}
3111
3112PyObject *
3113PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003114 Py_ssize_t size,
3115 const char *errors,
3116 int *byteorder,
3117 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003118{
3119 const char *starts = s;
3120 Py_ssize_t startinpos;
3121 Py_ssize_t endinpos;
3122 Py_ssize_t outpos;
3123 PyUnicodeObject *unicode;
3124 Py_UNICODE *p;
3125#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003126 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003127 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003128#else
3129 const int pairs = 0;
3130#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003131 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003132 int bo = 0; /* assume native ordering by default */
3133 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003134 /* Offsets from q for retrieving bytes in the right order. */
3135#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3136 int iorder[] = {0, 1, 2, 3};
3137#else
3138 int iorder[] = {3, 2, 1, 0};
3139#endif
3140 PyObject *errorHandler = NULL;
3141 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003142
Walter Dörwald41980ca2007-08-16 21:55:45 +00003143 q = (unsigned char *)s;
3144 e = q + size;
3145
3146 if (byteorder)
3147 bo = *byteorder;
3148
3149 /* Check for BOM marks (U+FEFF) in the input and adjust current
3150 byte order setting accordingly. In native mode, the leading BOM
3151 mark is skipped, in all other modes, it is copied to the output
3152 stream as-is (giving a ZWNBSP character). */
3153 if (bo == 0) {
3154 if (size >= 4) {
3155 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003156 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003157#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003158 if (bom == 0x0000FEFF) {
3159 q += 4;
3160 bo = -1;
3161 }
3162 else if (bom == 0xFFFE0000) {
3163 q += 4;
3164 bo = 1;
3165 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003166#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 if (bom == 0x0000FEFF) {
3168 q += 4;
3169 bo = 1;
3170 }
3171 else if (bom == 0xFFFE0000) {
3172 q += 4;
3173 bo = -1;
3174 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003175#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003177 }
3178
3179 if (bo == -1) {
3180 /* force LE */
3181 iorder[0] = 0;
3182 iorder[1] = 1;
3183 iorder[2] = 2;
3184 iorder[3] = 3;
3185 }
3186 else if (bo == 1) {
3187 /* force BE */
3188 iorder[0] = 3;
3189 iorder[1] = 2;
3190 iorder[2] = 1;
3191 iorder[3] = 0;
3192 }
3193
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003194 /* On narrow builds we split characters outside the BMP into two
3195 codepoints => count how much extra space we need. */
3196#ifndef Py_UNICODE_WIDE
3197 for (qq = q; qq < e; qq += 4)
3198 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3199 pairs++;
3200#endif
3201
3202 /* This might be one to much, because of a BOM */
3203 unicode = _PyUnicode_New((size+3)/4+pairs);
3204 if (!unicode)
3205 return NULL;
3206 if (size == 0)
3207 return (PyObject *)unicode;
3208
3209 /* Unpack UTF-32 encoded data */
3210 p = unicode->str;
3211
Walter Dörwald41980ca2007-08-16 21:55:45 +00003212 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 Py_UCS4 ch;
3214 /* remaining bytes at the end? (size should be divisible by 4) */
3215 if (e-q<4) {
3216 if (consumed)
3217 break;
3218 errmsg = "truncated data";
3219 startinpos = ((const char *)q)-starts;
3220 endinpos = ((const char *)e)-starts;
3221 goto utf32Error;
3222 /* The remaining input chars are ignored if the callback
3223 chooses to skip the input */
3224 }
3225 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3226 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003227
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 if (ch >= 0x110000)
3229 {
3230 errmsg = "codepoint not in range(0x110000)";
3231 startinpos = ((const char *)q)-starts;
3232 endinpos = startinpos+4;
3233 goto utf32Error;
3234 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003235#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 if (ch >= 0x10000)
3237 {
3238 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3239 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3240 }
3241 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003242#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 *p++ = ch;
3244 q += 4;
3245 continue;
3246 utf32Error:
3247 outpos = p-PyUnicode_AS_UNICODE(unicode);
3248 if (unicode_decode_call_errorhandler(
3249 errors, &errorHandler,
3250 "utf32", errmsg,
3251 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3252 &unicode, &outpos, &p))
3253 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003254 }
3255
3256 if (byteorder)
3257 *byteorder = bo;
3258
3259 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003261
3262 /* Adjust length */
3263 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3264 goto onError;
3265
3266 Py_XDECREF(errorHandler);
3267 Py_XDECREF(exc);
3268 return (PyObject *)unicode;
3269
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003271 Py_DECREF(unicode);
3272 Py_XDECREF(errorHandler);
3273 Py_XDECREF(exc);
3274 return NULL;
3275}
3276
3277PyObject *
3278PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003279 Py_ssize_t size,
3280 const char *errors,
3281 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003282{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003283 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003284 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003285 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003286#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003287 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003288#else
3289 const int pairs = 0;
3290#endif
3291 /* Offsets from p for storing byte pairs in the right order. */
3292#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3293 int iorder[] = {0, 1, 2, 3};
3294#else
3295 int iorder[] = {3, 2, 1, 0};
3296#endif
3297
Benjamin Peterson29060642009-01-31 22:14:21 +00003298#define STORECHAR(CH) \
3299 do { \
3300 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3301 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3302 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3303 p[iorder[0]] = (CH) & 0xff; \
3304 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003305 } while(0)
3306
3307 /* In narrow builds we can output surrogate pairs as one codepoint,
3308 so we need less space. */
3309#ifndef Py_UNICODE_WIDE
3310 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3312 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3313 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003314#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003315 nsize = (size - pairs + (byteorder == 0));
3316 bytesize = nsize * 4;
3317 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003318 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003319 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320 if (v == NULL)
3321 return NULL;
3322
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003323 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003324 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003325 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003326 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003327 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003328
3329 if (byteorder == -1) {
3330 /* force LE */
3331 iorder[0] = 0;
3332 iorder[1] = 1;
3333 iorder[2] = 2;
3334 iorder[3] = 3;
3335 }
3336 else if (byteorder == 1) {
3337 /* force BE */
3338 iorder[0] = 3;
3339 iorder[1] = 2;
3340 iorder[2] = 1;
3341 iorder[3] = 0;
3342 }
3343
3344 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003346#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003347 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3348 Py_UCS4 ch2 = *s;
3349 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3350 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3351 s++;
3352 size--;
3353 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003354 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003355#endif
3356 STORECHAR(ch);
3357 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003358
3359 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003360 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003361#undef STORECHAR
3362}
3363
Alexander Belopolsky40018472011-02-26 01:02:56 +00003364PyObject *
3365PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003366{
3367 if (!PyUnicode_Check(unicode)) {
3368 PyErr_BadArgument();
3369 return NULL;
3370 }
3371 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 PyUnicode_GET_SIZE(unicode),
3373 NULL,
3374 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003375}
3376
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377/* --- UTF-16 Codec ------------------------------------------------------- */
3378
Tim Peters772747b2001-08-09 22:21:55 +00003379PyObject *
3380PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003381 Py_ssize_t size,
3382 const char *errors,
3383 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384{
Walter Dörwald69652032004-09-07 20:24:22 +00003385 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3386}
3387
Antoine Pitrouab868312009-01-10 15:40:25 +00003388/* Two masks for fast checking of whether a C 'long' may contain
3389 UTF16-encoded surrogate characters. This is an efficient heuristic,
3390 assuming that non-surrogate characters with a code point >= 0x8000 are
3391 rare in most input.
3392 FAST_CHAR_MASK is used when the input is in native byte ordering,
3393 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003394*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003395#if (SIZEOF_LONG == 8)
3396# define FAST_CHAR_MASK 0x8000800080008000L
3397# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3398#elif (SIZEOF_LONG == 4)
3399# define FAST_CHAR_MASK 0x80008000L
3400# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3401#else
3402# error C 'long' size should be either 4 or 8!
3403#endif
3404
Walter Dörwald69652032004-09-07 20:24:22 +00003405PyObject *
3406PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003407 Py_ssize_t size,
3408 const char *errors,
3409 int *byteorder,
3410 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003411{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003413 Py_ssize_t startinpos;
3414 Py_ssize_t endinpos;
3415 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 PyUnicodeObject *unicode;
3417 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003418 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003419 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003420 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003421 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003422 /* Offsets from q for retrieving byte pairs in the right order. */
3423#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3424 int ihi = 1, ilo = 0;
3425#else
3426 int ihi = 0, ilo = 1;
3427#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 PyObject *errorHandler = NULL;
3429 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430
3431 /* Note: size will always be longer than the resulting Unicode
3432 character count */
3433 unicode = _PyUnicode_New(size);
3434 if (!unicode)
3435 return NULL;
3436 if (size == 0)
3437 return (PyObject *)unicode;
3438
3439 /* Unpack UTF-16 encoded data */
3440 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003441 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003442 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443
3444 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003445 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003447 /* Check for BOM marks (U+FEFF) in the input and adjust current
3448 byte order setting accordingly. In native mode, the leading BOM
3449 mark is skipped, in all other modes, it is copied to the output
3450 stream as-is (giving a ZWNBSP character). */
3451 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003452 if (size >= 2) {
3453 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003454#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 if (bom == 0xFEFF) {
3456 q += 2;
3457 bo = -1;
3458 }
3459 else if (bom == 0xFFFE) {
3460 q += 2;
3461 bo = 1;
3462 }
Tim Petersced69f82003-09-16 20:30:58 +00003463#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 if (bom == 0xFEFF) {
3465 q += 2;
3466 bo = 1;
3467 }
3468 else if (bom == 0xFFFE) {
3469 q += 2;
3470 bo = -1;
3471 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003472#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003473 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475
Tim Peters772747b2001-08-09 22:21:55 +00003476 if (bo == -1) {
3477 /* force LE */
3478 ihi = 1;
3479 ilo = 0;
3480 }
3481 else if (bo == 1) {
3482 /* force BE */
3483 ihi = 0;
3484 ilo = 1;
3485 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003486#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3487 native_ordering = ilo < ihi;
3488#else
3489 native_ordering = ilo > ihi;
3490#endif
Tim Peters772747b2001-08-09 22:21:55 +00003491
Antoine Pitrouab868312009-01-10 15:40:25 +00003492 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003493 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003494 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003495 /* First check for possible aligned read of a C 'long'. Unaligned
3496 reads are more expensive, better to defer to another iteration. */
3497 if (!((size_t) q & LONG_PTR_MASK)) {
3498 /* Fast path for runs of non-surrogate chars. */
3499 register const unsigned char *_q = q;
3500 Py_UNICODE *_p = p;
3501 if (native_ordering) {
3502 /* Native ordering is simple: as long as the input cannot
3503 possibly contain a surrogate char, do an unrolled copy
3504 of several 16-bit code points to the target object.
3505 The non-surrogate check is done on several input bytes
3506 at a time (as many as a C 'long' can contain). */
3507 while (_q < aligned_end) {
3508 unsigned long data = * (unsigned long *) _q;
3509 if (data & FAST_CHAR_MASK)
3510 break;
3511 _p[0] = ((unsigned short *) _q)[0];
3512 _p[1] = ((unsigned short *) _q)[1];
3513#if (SIZEOF_LONG == 8)
3514 _p[2] = ((unsigned short *) _q)[2];
3515 _p[3] = ((unsigned short *) _q)[3];
3516#endif
3517 _q += SIZEOF_LONG;
3518 _p += SIZEOF_LONG / 2;
3519 }
3520 }
3521 else {
3522 /* Byteswapped ordering is similar, but we must decompose
3523 the copy bytewise, and take care of zero'ing out the
3524 upper bytes if the target object is in 32-bit units
3525 (that is, in UCS-4 builds). */
3526 while (_q < aligned_end) {
3527 unsigned long data = * (unsigned long *) _q;
3528 if (data & SWAPPED_FAST_CHAR_MASK)
3529 break;
3530 /* Zero upper bytes in UCS-4 builds */
3531#if (Py_UNICODE_SIZE > 2)
3532 _p[0] = 0;
3533 _p[1] = 0;
3534#if (SIZEOF_LONG == 8)
3535 _p[2] = 0;
3536 _p[3] = 0;
3537#endif
3538#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003539 /* Issue #4916; UCS-4 builds on big endian machines must
3540 fill the two last bytes of each 4-byte unit. */
3541#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3542# define OFF 2
3543#else
3544# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003545#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003546 ((unsigned char *) _p)[OFF + 1] = _q[0];
3547 ((unsigned char *) _p)[OFF + 0] = _q[1];
3548 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3549 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3550#if (SIZEOF_LONG == 8)
3551 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3552 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3553 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3554 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3555#endif
3556#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003557 _q += SIZEOF_LONG;
3558 _p += SIZEOF_LONG / 2;
3559 }
3560 }
3561 p = _p;
3562 q = _q;
3563 if (q >= e)
3564 break;
3565 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003566 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567
Benjamin Peterson14339b62009-01-31 16:36:08 +00003568 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003569
3570 if (ch < 0xD800 || ch > 0xDFFF) {
3571 *p++ = ch;
3572 continue;
3573 }
3574
3575 /* UTF-16 code pair: */
3576 if (q > e) {
3577 errmsg = "unexpected end of data";
3578 startinpos = (((const char *)q) - 2) - starts;
3579 endinpos = ((const char *)e) + 1 - starts;
3580 goto utf16Error;
3581 }
3582 if (0xD800 <= ch && ch <= 0xDBFF) {
3583 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3584 q += 2;
3585 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003586#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 *p++ = ch;
3588 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003589#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003591#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003592 continue;
3593 }
3594 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003595 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 startinpos = (((const char *)q)-4)-starts;
3597 endinpos = startinpos+2;
3598 goto utf16Error;
3599 }
3600
Benjamin Peterson14339b62009-01-31 16:36:08 +00003601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 errmsg = "illegal encoding";
3603 startinpos = (((const char *)q)-2)-starts;
3604 endinpos = startinpos+2;
3605 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003606
Benjamin Peterson29060642009-01-31 22:14:21 +00003607 utf16Error:
3608 outpos = p - PyUnicode_AS_UNICODE(unicode);
3609 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003610 errors,
3611 &errorHandler,
3612 "utf16", errmsg,
3613 &starts,
3614 (const char **)&e,
3615 &startinpos,
3616 &endinpos,
3617 &exc,
3618 (const char **)&q,
3619 &unicode,
3620 &outpos,
3621 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003624 /* remaining byte at the end? (size should be even) */
3625 if (e == q) {
3626 if (!consumed) {
3627 errmsg = "truncated data";
3628 startinpos = ((const char *)q) - starts;
3629 endinpos = ((const char *)e) + 1 - starts;
3630 outpos = p - PyUnicode_AS_UNICODE(unicode);
3631 if (unicode_decode_call_errorhandler(
3632 errors,
3633 &errorHandler,
3634 "utf16", errmsg,
3635 &starts,
3636 (const char **)&e,
3637 &startinpos,
3638 &endinpos,
3639 &exc,
3640 (const char **)&q,
3641 &unicode,
3642 &outpos,
3643 &p))
3644 goto onError;
3645 /* The remaining input chars are ignored if the callback
3646 chooses to skip the input */
3647 }
3648 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649
3650 if (byteorder)
3651 *byteorder = bo;
3652
Walter Dörwald69652032004-09-07 20:24:22 +00003653 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003654 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003657 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 goto onError;
3659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 Py_XDECREF(errorHandler);
3661 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 return (PyObject *)unicode;
3663
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 Py_XDECREF(errorHandler);
3667 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 return NULL;
3669}
3670
Antoine Pitrouab868312009-01-10 15:40:25 +00003671#undef FAST_CHAR_MASK
3672#undef SWAPPED_FAST_CHAR_MASK
3673
Tim Peters772747b2001-08-09 22:21:55 +00003674PyObject *
3675PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 Py_ssize_t size,
3677 const char *errors,
3678 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003680 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003681 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003682 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003683#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003684 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003685#else
3686 const int pairs = 0;
3687#endif
Tim Peters772747b2001-08-09 22:21:55 +00003688 /* Offsets from p for storing byte pairs in the right order. */
3689#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3690 int ihi = 1, ilo = 0;
3691#else
3692 int ihi = 0, ilo = 1;
3693#endif
3694
Benjamin Peterson29060642009-01-31 22:14:21 +00003695#define STORECHAR(CH) \
3696 do { \
3697 p[ihi] = ((CH) >> 8) & 0xff; \
3698 p[ilo] = (CH) & 0xff; \
3699 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003700 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003702#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003703 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 if (s[i] >= 0x10000)
3705 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003706#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003707 /* 2 * (size + pairs + (byteorder == 0)) */
3708 if (size > PY_SSIZE_T_MAX ||
3709 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003711 nsize = size + pairs + (byteorder == 0);
3712 bytesize = nsize * 2;
3713 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003715 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 if (v == NULL)
3717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003719 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003722 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003723 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003724
3725 if (byteorder == -1) {
3726 /* force LE */
3727 ihi = 1;
3728 ilo = 0;
3729 }
3730 else if (byteorder == 1) {
3731 /* force BE */
3732 ihi = 0;
3733 ilo = 1;
3734 }
3735
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003736 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003737 Py_UNICODE ch = *s++;
3738 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003739#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003740 if (ch >= 0x10000) {
3741 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3742 ch = 0xD800 | ((ch-0x10000) >> 10);
3743 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003744#endif
Tim Peters772747b2001-08-09 22:21:55 +00003745 STORECHAR(ch);
3746 if (ch2)
3747 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003748 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003749
3750 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753}
3754
Alexander Belopolsky40018472011-02-26 01:02:56 +00003755PyObject *
3756PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757{
3758 if (!PyUnicode_Check(unicode)) {
3759 PyErr_BadArgument();
3760 return NULL;
3761 }
3762 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003763 PyUnicode_GET_SIZE(unicode),
3764 NULL,
3765 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766}
3767
3768/* --- Unicode Escape Codec ----------------------------------------------- */
3769
Fredrik Lundh06d12682001-01-24 07:59:11 +00003770static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003771
Alexander Belopolsky40018472011-02-26 01:02:56 +00003772PyObject *
3773PyUnicode_DecodeUnicodeEscape(const char *s,
3774 Py_ssize_t size,
3775 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003778 Py_ssize_t startinpos;
3779 Py_ssize_t endinpos;
3780 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003785 char* message;
3786 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 PyObject *errorHandler = NULL;
3788 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003789
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 /* Escaped strings will always be longer than the resulting
3791 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 length after conversion to the true value.
3793 (but if the error callback returns a long replacement string
3794 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 v = _PyUnicode_New(size);
3796 if (v == NULL)
3797 goto onError;
3798 if (size == 0)
3799 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003800
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003801 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003803
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 while (s < end) {
3805 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003806 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808
3809 /* Non-escape characters are interpreted as Unicode ordinals */
3810 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003811 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 continue;
3813 }
3814
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 /* \ - Escapes */
3817 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003818 c = *s++;
3819 if (s > end)
3820 c = '\0'; /* Invalid after \ */
3821 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 case '\n': break;
3825 case '\\': *p++ = '\\'; break;
3826 case '\'': *p++ = '\''; break;
3827 case '\"': *p++ = '\"'; break;
3828 case 'b': *p++ = '\b'; break;
3829 case 'f': *p++ = '\014'; break; /* FF */
3830 case 't': *p++ = '\t'; break;
3831 case 'n': *p++ = '\n'; break;
3832 case 'r': *p++ = '\r'; break;
3833 case 'v': *p++ = '\013'; break; /* VT */
3834 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3835
Benjamin Peterson29060642009-01-31 22:14:21 +00003836 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 case '0': case '1': case '2': case '3':
3838 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003839 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003840 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003841 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003842 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003843 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003845 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 break;
3847
Benjamin Peterson29060642009-01-31 22:14:21 +00003848 /* hex escapes */
3849 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003851 digits = 2;
3852 message = "truncated \\xXX escape";
3853 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003857 digits = 4;
3858 message = "truncated \\uXXXX escape";
3859 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860
Benjamin Peterson29060642009-01-31 22:14:21 +00003861 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003862 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003863 digits = 8;
3864 message = "truncated \\UXXXXXXXX escape";
3865 hexescape:
3866 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 outpos = p-PyUnicode_AS_UNICODE(v);
3868 if (s+digits>end) {
3869 endinpos = size;
3870 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003871 errors, &errorHandler,
3872 "unicodeescape", "end of string in escape sequence",
3873 &starts, &end, &startinpos, &endinpos, &exc, &s,
3874 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 goto onError;
3876 goto nextByte;
3877 }
3878 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003879 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003880 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 endinpos = (s+i+1)-starts;
3882 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003883 errors, &errorHandler,
3884 "unicodeescape", message,
3885 &starts, &end, &startinpos, &endinpos, &exc, &s,
3886 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003887 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003889 }
3890 chr = (chr<<4) & ~0xF;
3891 if (c >= '0' && c <= '9')
3892 chr += c - '0';
3893 else if (c >= 'a' && c <= 'f')
3894 chr += 10 + c - 'a';
3895 else
3896 chr += 10 + c - 'A';
3897 }
3898 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003899 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003900 /* _decoding_error will have already written into the
3901 target buffer. */
3902 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003903 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003904 /* when we get here, chr is a 32-bit unicode character */
3905 if (chr <= 0xffff)
3906 /* UCS-2 character */
3907 *p++ = (Py_UNICODE) chr;
3908 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003909 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003910 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003911#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003912 *p++ = chr;
3913#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003914 chr -= 0x10000L;
3915 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003916 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003917#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003918 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919 endinpos = s-starts;
3920 outpos = p-PyUnicode_AS_UNICODE(v);
3921 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003922 errors, &errorHandler,
3923 "unicodeescape", "illegal Unicode character",
3924 &starts, &end, &startinpos, &endinpos, &exc, &s,
3925 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003926 goto onError;
3927 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003928 break;
3929
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003931 case 'N':
3932 message = "malformed \\N character escape";
3933 if (ucnhash_CAPI == NULL) {
3934 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003935 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003936 if (ucnhash_CAPI == NULL)
3937 goto ucnhashError;
3938 }
3939 if (*s == '{') {
3940 const char *start = s+1;
3941 /* look for the closing brace */
3942 while (*s != '}' && s < end)
3943 s++;
3944 if (s > start && s < end && *s == '}') {
3945 /* found a name. look it up in the unicode database */
3946 message = "unknown Unicode character name";
3947 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003948 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003949 goto store;
3950 }
3951 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 endinpos = s-starts;
3953 outpos = p-PyUnicode_AS_UNICODE(v);
3954 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003955 errors, &errorHandler,
3956 "unicodeescape", message,
3957 &starts, &end, &startinpos, &endinpos, &exc, &s,
3958 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003959 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003960 break;
3961
3962 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003963 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 message = "\\ at end of string";
3965 s--;
3966 endinpos = s-starts;
3967 outpos = p-PyUnicode_AS_UNICODE(v);
3968 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003969 errors, &errorHandler,
3970 "unicodeescape", message,
3971 &starts, &end, &startinpos, &endinpos, &exc, &s,
3972 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003973 goto onError;
3974 }
3975 else {
3976 *p++ = '\\';
3977 *p++ = (unsigned char)s[-1];
3978 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003979 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003984 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003986 Py_XDECREF(errorHandler);
3987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003989
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003991 PyErr_SetString(
3992 PyExc_UnicodeError,
3993 "\\N escapes not supported (can't load unicodedata module)"
3994 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003995 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996 Py_XDECREF(errorHandler);
3997 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003998 return NULL;
3999
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004002 Py_XDECREF(errorHandler);
4003 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 return NULL;
4005}
4006
/* Return a Unicode-Escape string version of the Unicode object.

   The encoder, PyUnicode_EncodeUnicodeEscape(), takes only the buffer
   and its size: there is no `quotes` argument and the result is never
   enclosed in u"" or u'' quotes.

*/
4013
Thomas Wouters477c8d52006-05-27 19:21:47 +00004014Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 Py_ssize_t size,
4016 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00004017{
4018 /* like wcschr, but doesn't stop at NULL characters */
4019
4020 while (size-- > 0) {
4021 if (*s == ch)
4022 return s;
4023 s++;
4024 }
4025
4026 return NULL;
4027}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004028
/* Lowercase hexadecimal digits, indexed by nibble value (0..15).  Used by
   the escape encoders below to emit \xhh, \uxxxx and \Uxxxxxxxx. */
static const char *hexdigits =
    "0123456789abcdef";
4030
Alexander Belopolsky40018472011-02-26 01:02:56 +00004031PyObject *
4032PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4033 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004035 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004038#ifdef Py_UNICODE_WIDE
4039 const Py_ssize_t expandsize = 10;
4040#else
4041 const Py_ssize_t expandsize = 6;
4042#endif
4043
Thomas Wouters89f507f2006-12-13 04:49:30 +00004044 /* XXX(nnorwitz): rather than over-allocating, it would be
4045 better to choose a different scheme. Perhaps scan the
4046 first N-chars of the string and allocate based on that size.
4047 */
4048 /* Initial allocation is based on the longest-possible unichr
4049 escape.
4050
4051 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4052 unichr, so in this case it's the longest unichr escape. In
4053 narrow (UTF-16) builds this is five chars per source unichr
4054 since there are two unichrs in the surrogate pair, so in narrow
4055 (UTF-16) builds it's not the longest unichr escape.
4056
4057 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4058 so in the narrow (UTF-16) build case it's the longest unichr
4059 escape.
4060 */
4061
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004062 if (size == 0)
4063 return PyBytes_FromStringAndSize(NULL, 0);
4064
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004065 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004067
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004068 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 2
4070 + expandsize*size
4071 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 if (repr == NULL)
4073 return NULL;
4074
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004075 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 while (size-- > 0) {
4078 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004079
Walter Dörwald79e913e2007-05-12 11:08:06 +00004080 /* Escape backslashes */
4081 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 *p++ = '\\';
4083 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004084 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004085 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004086
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004087#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004088 /* Map 21-bit characters to '\U00xxxxxx' */
4089 else if (ch >= 0x10000) {
4090 *p++ = '\\';
4091 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004092 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4093 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4094 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4095 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4096 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4097 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4098 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4099 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004101 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004102#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4104 else if (ch >= 0xD800 && ch < 0xDC00) {
4105 Py_UNICODE ch2;
4106 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004107
Benjamin Peterson29060642009-01-31 22:14:21 +00004108 ch2 = *s++;
4109 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004110 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4112 *p++ = '\\';
4113 *p++ = 'U';
4114 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4115 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4116 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4117 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4118 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4119 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4120 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4121 *p++ = hexdigits[ucs & 0x0000000F];
4122 continue;
4123 }
4124 /* Fall through: isolated surrogates are copied as-is */
4125 s--;
4126 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004127 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004128#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004131 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 *p++ = '\\';
4133 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004134 *p++ = hexdigits[(ch >> 12) & 0x000F];
4135 *p++ = hexdigits[(ch >> 8) & 0x000F];
4136 *p++ = hexdigits[(ch >> 4) & 0x000F];
4137 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004139
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004140 /* Map special whitespace to '\t', \n', '\r' */
4141 else if (ch == '\t') {
4142 *p++ = '\\';
4143 *p++ = 't';
4144 }
4145 else if (ch == '\n') {
4146 *p++ = '\\';
4147 *p++ = 'n';
4148 }
4149 else if (ch == '\r') {
4150 *p++ = '\\';
4151 *p++ = 'r';
4152 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004153
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004154 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004155 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004157 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004158 *p++ = hexdigits[(ch >> 4) & 0x000F];
4159 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004160 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004161
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 /* Copy everything else as-is */
4163 else
4164 *p++ = (char) ch;
4165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004167 assert(p - PyBytes_AS_STRING(repr) > 0);
4168 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4169 return NULL;
4170 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171}
4172
Alexander Belopolsky40018472011-02-26 01:02:56 +00004173PyObject *
4174PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004176 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 if (!PyUnicode_Check(unicode)) {
4178 PyErr_BadArgument();
4179 return NULL;
4180 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004181 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4182 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004183 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184}
4185
4186/* --- Raw Unicode Escape Codec ------------------------------------------- */
4187
Alexander Belopolsky40018472011-02-26 01:02:56 +00004188PyObject *
4189PyUnicode_DecodeRawUnicodeEscape(const char *s,
4190 Py_ssize_t size,
4191 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004194 Py_ssize_t startinpos;
4195 Py_ssize_t endinpos;
4196 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 const char *end;
4200 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 PyObject *errorHandler = NULL;
4202 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004203
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 /* Escaped strings will always be longer than the resulting
4205 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 length after conversion to the true value. (But decoding error
4207 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 v = _PyUnicode_New(size);
4209 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 end = s + size;
4215 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 unsigned char c;
4217 Py_UCS4 x;
4218 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004219 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 /* Non-escape characters are interpreted as Unicode ordinals */
4222 if (*s != '\\') {
4223 *p++ = (unsigned char)*s++;
4224 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004225 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 startinpos = s-starts;
4227
4228 /* \u-escapes are only interpreted iff the number of leading
4229 backslashes if odd */
4230 bs = s;
4231 for (;s < end;) {
4232 if (*s != '\\')
4233 break;
4234 *p++ = (unsigned char)*s++;
4235 }
4236 if (((s - bs) & 1) == 0 ||
4237 s >= end ||
4238 (*s != 'u' && *s != 'U')) {
4239 continue;
4240 }
4241 p--;
4242 count = *s=='u' ? 4 : 8;
4243 s++;
4244
4245 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4246 outpos = p-PyUnicode_AS_UNICODE(v);
4247 for (x = 0, i = 0; i < count; ++i, ++s) {
4248 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004249 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 endinpos = s-starts;
4251 if (unicode_decode_call_errorhandler(
4252 errors, &errorHandler,
4253 "rawunicodeescape", "truncated \\uXXXX",
4254 &starts, &end, &startinpos, &endinpos, &exc, &s,
4255 &v, &outpos, &p))
4256 goto onError;
4257 goto nextByte;
4258 }
4259 x = (x<<4) & ~0xF;
4260 if (c >= '0' && c <= '9')
4261 x += c - '0';
4262 else if (c >= 'a' && c <= 'f')
4263 x += 10 + c - 'a';
4264 else
4265 x += 10 + c - 'A';
4266 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004267 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 /* UCS-2 character */
4269 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004270 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 /* UCS-4 character. Either store directly, or as
4272 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004273#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004275#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 x -= 0x10000L;
4277 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4278 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004279#endif
4280 } else {
4281 endinpos = s-starts;
4282 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004283 if (unicode_decode_call_errorhandler(
4284 errors, &errorHandler,
4285 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 &starts, &end, &startinpos, &endinpos, &exc, &s,
4287 &v, &outpos, &p))
4288 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 nextByte:
4291 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004293 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_XDECREF(errorHandler);
4296 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004298
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 Py_XDECREF(errorHandler);
4302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303 return NULL;
4304}
4305
Alexander Belopolsky40018472011-02-26 01:02:56 +00004306PyObject *
4307PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4308 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004310 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 char *p;
4312 char *q;
4313
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004314#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004315 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004316#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004317 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004318#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004319
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004320 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004321 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004322
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004323 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 if (repr == NULL)
4325 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004326 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004327 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004329 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 while (size-- > 0) {
4331 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004332#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004333 /* Map 32-bit characters to '\Uxxxxxxxx' */
4334 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004335 *p++ = '\\';
4336 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004337 *p++ = hexdigits[(ch >> 28) & 0xf];
4338 *p++ = hexdigits[(ch >> 24) & 0xf];
4339 *p++ = hexdigits[(ch >> 20) & 0xf];
4340 *p++ = hexdigits[(ch >> 16) & 0xf];
4341 *p++ = hexdigits[(ch >> 12) & 0xf];
4342 *p++ = hexdigits[(ch >> 8) & 0xf];
4343 *p++ = hexdigits[(ch >> 4) & 0xf];
4344 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004345 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004346 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004347#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4349 if (ch >= 0xD800 && ch < 0xDC00) {
4350 Py_UNICODE ch2;
4351 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004352
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 ch2 = *s++;
4354 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004355 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004356 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4357 *p++ = '\\';
4358 *p++ = 'U';
4359 *p++ = hexdigits[(ucs >> 28) & 0xf];
4360 *p++ = hexdigits[(ucs >> 24) & 0xf];
4361 *p++ = hexdigits[(ucs >> 20) & 0xf];
4362 *p++ = hexdigits[(ucs >> 16) & 0xf];
4363 *p++ = hexdigits[(ucs >> 12) & 0xf];
4364 *p++ = hexdigits[(ucs >> 8) & 0xf];
4365 *p++ = hexdigits[(ucs >> 4) & 0xf];
4366 *p++ = hexdigits[ucs & 0xf];
4367 continue;
4368 }
4369 /* Fall through: isolated surrogates are copied as-is */
4370 s--;
4371 size++;
4372 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004373#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004374 /* Map 16-bit characters to '\uxxxx' */
4375 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 *p++ = '\\';
4377 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004378 *p++ = hexdigits[(ch >> 12) & 0xf];
4379 *p++ = hexdigits[(ch >> 8) & 0xf];
4380 *p++ = hexdigits[(ch >> 4) & 0xf];
4381 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 /* Copy everything else as-is */
4384 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385 *p++ = (char) ch;
4386 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004387 size = p - q;
4388
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004389 assert(size > 0);
4390 if (_PyBytes_Resize(&repr, size) < 0)
4391 return NULL;
4392 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393}
4394
Alexander Belopolsky40018472011-02-26 01:02:56 +00004395PyObject *
4396PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004398 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004400 PyErr_BadArgument();
4401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004403 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4404 PyUnicode_GET_SIZE(unicode));
4405
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004406 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407}
4408
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004409/* --- Unicode Internal Codec ------------------------------------------- */
4410
Alexander Belopolsky40018472011-02-26 01:02:56 +00004411PyObject *
4412_PyUnicode_DecodeUnicodeInternal(const char *s,
4413 Py_ssize_t size,
4414 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004415{
4416 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004417 Py_ssize_t startinpos;
4418 Py_ssize_t endinpos;
4419 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004420 PyUnicodeObject *v;
4421 Py_UNICODE *p;
4422 const char *end;
4423 const char *reason;
4424 PyObject *errorHandler = NULL;
4425 PyObject *exc = NULL;
4426
Neal Norwitzd43069c2006-01-08 01:12:10 +00004427#ifdef Py_UNICODE_WIDE
4428 Py_UNICODE unimax = PyUnicode_GetMax();
4429#endif
4430
Thomas Wouters89f507f2006-12-13 04:49:30 +00004431 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004432 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4433 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004435 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004437 p = PyUnicode_AS_UNICODE(v);
4438 end = s + size;
4439
4440 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004441 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004442 /* We have to sanity check the raw data, otherwise doom looms for
4443 some malformed UCS-4 data. */
4444 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004445#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004446 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004447#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004448 end-s < Py_UNICODE_SIZE
4449 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004451 startinpos = s - starts;
4452 if (end-s < Py_UNICODE_SIZE) {
4453 endinpos = end-starts;
4454 reason = "truncated input";
4455 }
4456 else {
4457 endinpos = s - starts + Py_UNICODE_SIZE;
4458 reason = "illegal code point (> 0x10FFFF)";
4459 }
4460 outpos = p - PyUnicode_AS_UNICODE(v);
4461 if (unicode_decode_call_errorhandler(
4462 errors, &errorHandler,
4463 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004464 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004465 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004466 goto onError;
4467 }
4468 }
4469 else {
4470 p++;
4471 s += Py_UNICODE_SIZE;
4472 }
4473 }
4474
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004475 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004476 goto onError;
4477 Py_XDECREF(errorHandler);
4478 Py_XDECREF(exc);
4479 return (PyObject *)v;
4480
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004482 Py_XDECREF(v);
4483 Py_XDECREF(errorHandler);
4484 Py_XDECREF(exc);
4485 return NULL;
4486}
4487
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488/* --- Latin-1 Codec ------------------------------------------------------ */
4489
Alexander Belopolsky40018472011-02-26 01:02:56 +00004490PyObject *
4491PyUnicode_DecodeLatin1(const char *s,
4492 Py_ssize_t size,
4493 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494{
4495 PyUnicodeObject *v;
4496 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004497 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004498
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004500 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 Py_UNICODE r = *(unsigned char*)s;
4502 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004503 }
4504
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505 v = _PyUnicode_New(size);
4506 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004511 e = s + size;
4512 /* Unrolling the copy makes it much faster by reducing the looping
4513 overhead. This is similar to what many memcpy() implementations do. */
4514 unrolled_end = e - 4;
4515 while (s < unrolled_end) {
4516 p[0] = (unsigned char) s[0];
4517 p[1] = (unsigned char) s[1];
4518 p[2] = (unsigned char) s[2];
4519 p[3] = (unsigned char) s[3];
4520 s += 4;
4521 p += 4;
4522 }
4523 while (s < e)
4524 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004526
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 Py_XDECREF(v);
4529 return NULL;
4530}
4531
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004533static void
4534make_encode_exception(PyObject **exceptionObject,
4535 const char *encoding,
4536 const Py_UNICODE *unicode, Py_ssize_t size,
4537 Py_ssize_t startpos, Py_ssize_t endpos,
4538 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 *exceptionObject = PyUnicodeEncodeError_Create(
4542 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 }
4544 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4546 goto onError;
4547 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4548 goto onError;
4549 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4550 goto onError;
4551 return;
4552 onError:
4553 Py_DECREF(*exceptionObject);
4554 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 }
4556}
4557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004559static void
4560raise_encode_exception(PyObject **exceptionObject,
4561 const char *encoding,
4562 const Py_UNICODE *unicode, Py_ssize_t size,
4563 Py_ssize_t startpos, Py_ssize_t endpos,
4564 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565{
4566 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570}
4571
4572/* error handling callback helper:
4573 build arguments, call the callback and check the arguments,
4574 put the result into newpos and return the replacement string, which
4575 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004576static PyObject *
4577unicode_encode_call_errorhandler(const char *errors,
4578 PyObject **errorHandler,
4579 const char *encoding, const char *reason,
4580 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4581 Py_ssize_t startpos, Py_ssize_t endpos,
4582 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004584 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585
4586 PyObject *restuple;
4587 PyObject *resunicode;
4588
4589 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004592 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 }
4594
4595 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599
4600 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004603 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004605 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 Py_DECREF(restuple);
4607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004609 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 &resunicode, newpos)) {
4611 Py_DECREF(restuple);
4612 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004614 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4615 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4616 Py_DECREF(restuple);
4617 return NULL;
4618 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004620 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004621 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4623 Py_DECREF(restuple);
4624 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004625 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 Py_INCREF(resunicode);
4627 Py_DECREF(restuple);
4628 return resunicode;
4629}
4630
Alexander Belopolsky40018472011-02-26 01:02:56 +00004631static PyObject *
4632unicode_encode_ucs1(const Py_UNICODE *p,
4633 Py_ssize_t size,
4634 const char *errors,
4635 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636{
4637 /* output object */
4638 PyObject *res;
4639 /* pointers to the beginning and end+1 of input */
4640 const Py_UNICODE *startp = p;
4641 const Py_UNICODE *endp = p + size;
4642 /* pointer to the beginning of the unencodable characters */
4643 /* const Py_UNICODE *badp = NULL; */
4644 /* pointer into the output */
4645 char *str;
4646 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004647 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004648 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4649 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 PyObject *errorHandler = NULL;
4651 PyObject *exc = NULL;
4652 /* the following variable is used for caching string comparisons
4653 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4654 int known_errorHandler = -1;
4655
4656 /* allocate enough for a simple encoding without
4657 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004658 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004659 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004660 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004662 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004663 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 ressize = size;
4665
4666 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004667 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 /* can we encode this? */
4670 if (c<limit) {
4671 /* no overflow check, because we know that the space is enough */
4672 *str++ = (char)c;
4673 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004674 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 else {
4676 Py_ssize_t unicodepos = p-startp;
4677 Py_ssize_t requiredsize;
4678 PyObject *repunicode;
4679 Py_ssize_t repsize;
4680 Py_ssize_t newpos;
4681 Py_ssize_t respos;
4682 Py_UNICODE *uni2;
4683 /* startpos for collecting unencodable chars */
4684 const Py_UNICODE *collstart = p;
4685 const Py_UNICODE *collend = p;
4686 /* find all unecodable characters */
4687 while ((collend < endp) && ((*collend)>=limit))
4688 ++collend;
4689 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4690 if (known_errorHandler==-1) {
4691 if ((errors==NULL) || (!strcmp(errors, "strict")))
4692 known_errorHandler = 1;
4693 else if (!strcmp(errors, "replace"))
4694 known_errorHandler = 2;
4695 else if (!strcmp(errors, "ignore"))
4696 known_errorHandler = 3;
4697 else if (!strcmp(errors, "xmlcharrefreplace"))
4698 known_errorHandler = 4;
4699 else
4700 known_errorHandler = 0;
4701 }
4702 switch (known_errorHandler) {
4703 case 1: /* strict */
4704 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4705 goto onError;
4706 case 2: /* replace */
4707 while (collstart++<collend)
4708 *str++ = '?'; /* fall through */
4709 case 3: /* ignore */
4710 p = collend;
4711 break;
4712 case 4: /* xmlcharrefreplace */
4713 respos = str - PyBytes_AS_STRING(res);
4714 /* determine replacement size (temporarily (mis)uses p) */
4715 for (p = collstart, repsize = 0; p < collend; ++p) {
4716 if (*p<10)
4717 repsize += 2+1+1;
4718 else if (*p<100)
4719 repsize += 2+2+1;
4720 else if (*p<1000)
4721 repsize += 2+3+1;
4722 else if (*p<10000)
4723 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004724#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 else
4726 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004727#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004728 else if (*p<100000)
4729 repsize += 2+5+1;
4730 else if (*p<1000000)
4731 repsize += 2+6+1;
4732 else
4733 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004734#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004735 }
4736 requiredsize = respos+repsize+(endp-collend);
4737 if (requiredsize > ressize) {
4738 if (requiredsize<2*ressize)
4739 requiredsize = 2*ressize;
4740 if (_PyBytes_Resize(&res, requiredsize))
4741 goto onError;
4742 str = PyBytes_AS_STRING(res) + respos;
4743 ressize = requiredsize;
4744 }
4745 /* generate replacement (temporarily (mis)uses p) */
4746 for (p = collstart; p < collend; ++p) {
4747 str += sprintf(str, "&#%d;", (int)*p);
4748 }
4749 p = collend;
4750 break;
4751 default:
4752 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4753 encoding, reason, startp, size, &exc,
4754 collstart-startp, collend-startp, &newpos);
4755 if (repunicode == NULL)
4756 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004757 if (PyBytes_Check(repunicode)) {
4758 /* Directly copy bytes result to output. */
4759 repsize = PyBytes_Size(repunicode);
4760 if (repsize > 1) {
4761 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004762 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004763 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4764 Py_DECREF(repunicode);
4765 goto onError;
4766 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004767 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004768 ressize += repsize-1;
4769 }
4770 memcpy(str, PyBytes_AsString(repunicode), repsize);
4771 str += repsize;
4772 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004773 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004774 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004775 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 /* need more space? (at least enough for what we
4777 have+the replacement+the rest of the string, so
4778 we won't have to check space for encodable characters) */
4779 respos = str - PyBytes_AS_STRING(res);
4780 repsize = PyUnicode_GET_SIZE(repunicode);
4781 requiredsize = respos+repsize+(endp-collend);
4782 if (requiredsize > ressize) {
4783 if (requiredsize<2*ressize)
4784 requiredsize = 2*ressize;
4785 if (_PyBytes_Resize(&res, requiredsize)) {
4786 Py_DECREF(repunicode);
4787 goto onError;
4788 }
4789 str = PyBytes_AS_STRING(res) + respos;
4790 ressize = requiredsize;
4791 }
4792 /* check if there is anything unencodable in the replacement
4793 and copy it to the output */
4794 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4795 c = *uni2;
4796 if (c >= limit) {
4797 raise_encode_exception(&exc, encoding, startp, size,
4798 unicodepos, unicodepos+1, reason);
4799 Py_DECREF(repunicode);
4800 goto onError;
4801 }
4802 *str = (char)c;
4803 }
4804 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004805 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004806 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004807 }
4808 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004809 /* Resize if we allocated to much */
4810 size = str - PyBytes_AS_STRING(res);
4811 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004812 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004813 if (_PyBytes_Resize(&res, size) < 0)
4814 goto onError;
4815 }
4816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 Py_XDECREF(errorHandler);
4818 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004819 return res;
4820
4821 onError:
4822 Py_XDECREF(res);
4823 Py_XDECREF(errorHandler);
4824 Py_XDECREF(exc);
4825 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826}
4827
Alexander Belopolsky40018472011-02-26 01:02:56 +00004828PyObject *
4829PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4830 Py_ssize_t size,
4831 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834}
4835
Alexander Belopolsky40018472011-02-26 01:02:56 +00004836PyObject *
4837PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838{
4839 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 PyErr_BadArgument();
4841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 }
4843 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 PyUnicode_GET_SIZE(unicode),
4845 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846}
4847
4848/* --- 7-bit ASCII Codec -------------------------------------------------- */
4849
Alexander Belopolsky40018472011-02-26 01:02:56 +00004850PyObject *
4851PyUnicode_DecodeASCII(const char *s,
4852 Py_ssize_t size,
4853 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 PyUnicodeObject *v;
4857 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004858 Py_ssize_t startinpos;
4859 Py_ssize_t endinpos;
4860 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 const char *e;
4862 PyObject *errorHandler = NULL;
4863 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004864
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004866 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 Py_UNICODE r = *(unsigned char*)s;
4868 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004869 }
Tim Petersced69f82003-09-16 20:30:58 +00004870
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 v = _PyUnicode_New(size);
4872 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004873 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877 e = s + size;
4878 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004879 register unsigned char c = (unsigned char)*s;
4880 if (c < 128) {
4881 *p++ = c;
4882 ++s;
4883 }
4884 else {
4885 startinpos = s-starts;
4886 endinpos = startinpos + 1;
4887 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4888 if (unicode_decode_call_errorhandler(
4889 errors, &errorHandler,
4890 "ascii", "ordinal not in range(128)",
4891 &starts, &e, &startinpos, &endinpos, &exc, &s,
4892 &v, &outpos, &p))
4893 goto onError;
4894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004896 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4898 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899 Py_XDECREF(errorHandler);
4900 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004902
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 Py_XDECREF(errorHandler);
4906 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 return NULL;
4908}
4909
Alexander Belopolsky40018472011-02-26 01:02:56 +00004910PyObject *
4911PyUnicode_EncodeASCII(const Py_UNICODE *p,
4912 Py_ssize_t size,
4913 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916}
4917
Alexander Belopolsky40018472011-02-26 01:02:56 +00004918PyObject *
4919PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920{
4921 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 PyErr_BadArgument();
4923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 }
4925 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 PyUnicode_GET_SIZE(unicode),
4927 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928}
4929
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004930#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004931
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004932/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004933
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004934#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004935#define NEED_RETRY
4936#endif
4937
4938/* XXX This code is limited to "true" double-byte encodings, as
4939 a) it assumes an incomplete character consists of a single byte, and
4940 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004942
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte.

   The byte must be reported as a lead byte by IsDBCSLeadByte(), and in
   addition one of the following must hold for the character CharPrev()
   finds before it: there is none (CharPrev() returned the same position),
   it does not start with a lead byte, or it is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
4954
4955/*
4956 * Decode MBCS string into unicode object. If 'final' is set, converts
4957 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4958 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004959static int
4960decode_mbcs(PyUnicodeObject **v,
4961 const char *s, /* MBCS string */
4962 int size, /* sizeof MBCS string */
4963 int final,
4964 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004965{
4966 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004967 Py_ssize_t n;
4968 DWORD usize;
4969 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004970
4971 assert(size >= 0);
4972
Victor Stinner554f3f02010-06-16 23:33:54 +00004973 /* check and handle 'errors' arg */
4974 if (errors==NULL || strcmp(errors, "strict")==0)
4975 flags = MB_ERR_INVALID_CHARS;
4976 else if (strcmp(errors, "ignore")==0)
4977 flags = 0;
4978 else {
4979 PyErr_Format(PyExc_ValueError,
4980 "mbcs encoding does not support errors='%s'",
4981 errors);
4982 return -1;
4983 }
4984
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004985 /* Skip trailing lead-byte unless 'final' is set */
4986 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004987 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004988
4989 /* First get the size of the result */
4990 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004991 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4992 if (usize==0)
4993 goto mbcs_decode_error;
4994 } else
4995 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004996
4997 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 /* Create unicode object */
4999 *v = _PyUnicode_New(usize);
5000 if (*v == NULL)
5001 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005002 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005003 }
5004 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 /* Extend unicode object */
5006 n = PyUnicode_GET_SIZE(*v);
5007 if (_PyUnicode_Resize(v, n + usize) < 0)
5008 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005009 }
5010
5011 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00005012 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005014 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5015 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005017 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00005019
5020mbcs_decode_error:
5021 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5022 we raise a UnicodeDecodeError - else it is a 'generic'
5023 windows error
5024 */
5025 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5026 /* Ideally, we should get reason from FormatMessage - this
5027 is the Windows 2000 English version of the message
5028 */
5029 PyObject *exc = NULL;
5030 const char *reason = "No mapping for the Unicode character exists "
5031 "in the target multi-byte code page.";
5032 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5033 if (exc != NULL) {
5034 PyCodec_StrictErrors(exc);
5035 Py_DECREF(exc);
5036 }
5037 } else {
5038 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5039 }
5040 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005041}
5042
Alexander Belopolsky40018472011-02-26 01:02:56 +00005043PyObject *
5044PyUnicode_DecodeMBCSStateful(const char *s,
5045 Py_ssize_t size,
5046 const char *errors,
5047 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005048{
5049 PyUnicodeObject *v = NULL;
5050 int done;
5051
5052 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005054
5055#ifdef NEED_RETRY
5056 retry:
5057 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005058 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005059 else
5060#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005061 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005062
5063 if (done < 0) {
5064 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005066 }
5067
5068 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005070
5071#ifdef NEED_RETRY
5072 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 s += done;
5074 size -= done;
5075 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005076 }
5077#endif
5078
5079 return (PyObject *)v;
5080}
5081
Alexander Belopolsky40018472011-02-26 01:02:56 +00005082PyObject *
5083PyUnicode_DecodeMBCS(const char *s,
5084 Py_ssize_t size,
5085 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005086{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005087 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5088}
5089
5090/*
5091 * Convert unicode into string object (MBCS).
5092 * Returns 0 if succeed, -1 otherwise.
5093 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005094static int
5095encode_mbcs(PyObject **repr,
5096 const Py_UNICODE *p, /* unicode */
5097 int size, /* size of unicode */
5098 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005099{
Victor Stinner554f3f02010-06-16 23:33:54 +00005100 BOOL usedDefaultChar = FALSE;
5101 BOOL *pusedDefaultChar;
5102 int mbcssize;
5103 Py_ssize_t n;
5104 PyObject *exc = NULL;
5105 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005106
5107 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005108
Victor Stinner554f3f02010-06-16 23:33:54 +00005109 /* check and handle 'errors' arg */
5110 if (errors==NULL || strcmp(errors, "strict")==0) {
5111 flags = WC_NO_BEST_FIT_CHARS;
5112 pusedDefaultChar = &usedDefaultChar;
5113 } else if (strcmp(errors, "replace")==0) {
5114 flags = 0;
5115 pusedDefaultChar = NULL;
5116 } else {
5117 PyErr_Format(PyExc_ValueError,
5118 "mbcs encoding does not support errors='%s'",
5119 errors);
5120 return -1;
5121 }
5122
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005123 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005124 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005125 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5126 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 if (mbcssize == 0) {
5128 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5129 return -1;
5130 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005131 /* If we used a default char, then we failed! */
5132 if (pusedDefaultChar && *pusedDefaultChar)
5133 goto mbcs_encode_error;
5134 } else {
5135 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005136 }
5137
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005138 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 /* Create string object */
5140 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5141 if (*repr == NULL)
5142 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005143 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005144 }
5145 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 /* Extend string object */
5147 n = PyBytes_Size(*repr);
5148 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5149 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005150 }
5151
5152 /* Do the conversion */
5153 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005155 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5156 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5158 return -1;
5159 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005160 if (pusedDefaultChar && *pusedDefaultChar)
5161 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005162 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005163 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005164
5165mbcs_encode_error:
5166 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5167 Py_XDECREF(exc);
5168 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005169}
5170
Alexander Belopolsky40018472011-02-26 01:02:56 +00005171PyObject *
5172PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5173 Py_ssize_t size,
5174 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005175{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005176 PyObject *repr = NULL;
5177 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005178
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005179#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005181 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005182 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005183 else
5184#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005185 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005186
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005187 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 Py_XDECREF(repr);
5189 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005190 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005191
5192#ifdef NEED_RETRY
5193 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 p += INT_MAX;
5195 size -= INT_MAX;
5196 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005197 }
5198#endif
5199
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005200 return repr;
5201}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005202
Alexander Belopolsky40018472011-02-26 01:02:56 +00005203PyObject *
5204PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005205{
5206 if (!PyUnicode_Check(unicode)) {
5207 PyErr_BadArgument();
5208 return NULL;
5209 }
5210 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 PyUnicode_GET_SIZE(unicode),
5212 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005213}
5214
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005215#undef NEED_RETRY
5216
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005217#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005218
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219/* --- Character Mapping Codec -------------------------------------------- */
5220
Alexander Belopolsky40018472011-02-26 01:02:56 +00005221PyObject *
5222PyUnicode_DecodeCharmap(const char *s,
5223 Py_ssize_t size,
5224 PyObject *mapping,
5225 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005227 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005228 Py_ssize_t startinpos;
5229 Py_ssize_t endinpos;
5230 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 PyUnicodeObject *v;
5233 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005234 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 PyObject *errorHandler = NULL;
5236 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005237 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005238 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005239
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 /* Default to Latin-1 */
5241 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243
5244 v = _PyUnicode_New(size);
5245 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005251 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 mapstring = PyUnicode_AS_UNICODE(mapping);
5253 maplen = PyUnicode_GET_SIZE(mapping);
5254 while (s < e) {
5255 unsigned char ch = *s;
5256 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 if (ch < maplen)
5259 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 if (x == 0xfffe) {
5262 /* undefined mapping */
5263 outpos = p-PyUnicode_AS_UNICODE(v);
5264 startinpos = s-starts;
5265 endinpos = startinpos+1;
5266 if (unicode_decode_call_errorhandler(
5267 errors, &errorHandler,
5268 "charmap", "character maps to <undefined>",
5269 &starts, &e, &startinpos, &endinpos, &exc, &s,
5270 &v, &outpos, &p)) {
5271 goto onError;
5272 }
5273 continue;
5274 }
5275 *p++ = x;
5276 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005277 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005278 }
5279 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005280 while (s < e) {
5281 unsigned char ch = *s;
5282 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005283
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5285 w = PyLong_FromLong((long)ch);
5286 if (w == NULL)
5287 goto onError;
5288 x = PyObject_GetItem(mapping, w);
5289 Py_DECREF(w);
5290 if (x == NULL) {
5291 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5292 /* No mapping found means: mapping is undefined. */
5293 PyErr_Clear();
5294 x = Py_None;
5295 Py_INCREF(x);
5296 } else
5297 goto onError;
5298 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005299
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 /* Apply mapping */
5301 if (PyLong_Check(x)) {
5302 long value = PyLong_AS_LONG(x);
5303 if (value < 0 || value > 65535) {
5304 PyErr_SetString(PyExc_TypeError,
5305 "character mapping must be in range(65536)");
5306 Py_DECREF(x);
5307 goto onError;
5308 }
5309 *p++ = (Py_UNICODE)value;
5310 }
5311 else if (x == Py_None) {
5312 /* undefined mapping */
5313 outpos = p-PyUnicode_AS_UNICODE(v);
5314 startinpos = s-starts;
5315 endinpos = startinpos+1;
5316 if (unicode_decode_call_errorhandler(
5317 errors, &errorHandler,
5318 "charmap", "character maps to <undefined>",
5319 &starts, &e, &startinpos, &endinpos, &exc, &s,
5320 &v, &outpos, &p)) {
5321 Py_DECREF(x);
5322 goto onError;
5323 }
5324 Py_DECREF(x);
5325 continue;
5326 }
5327 else if (PyUnicode_Check(x)) {
5328 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005329
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 if (targetsize == 1)
5331 /* 1-1 mapping */
5332 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005333
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 else if (targetsize > 1) {
5335 /* 1-n mapping */
5336 if (targetsize > extrachars) {
5337 /* resize first */
5338 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5339 Py_ssize_t needed = (targetsize - extrachars) + \
5340 (targetsize << 2);
5341 extrachars += needed;
5342 /* XXX overflow detection missing */
5343 if (_PyUnicode_Resize(&v,
5344 PyUnicode_GET_SIZE(v) + needed) < 0) {
5345 Py_DECREF(x);
5346 goto onError;
5347 }
5348 p = PyUnicode_AS_UNICODE(v) + oldpos;
5349 }
5350 Py_UNICODE_COPY(p,
5351 PyUnicode_AS_UNICODE(x),
5352 targetsize);
5353 p += targetsize;
5354 extrachars -= targetsize;
5355 }
5356 /* 1-0 mapping: skip the character */
5357 }
5358 else {
5359 /* wrong return value */
5360 PyErr_SetString(PyExc_TypeError,
5361 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005362 Py_DECREF(x);
5363 goto onError;
5364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 Py_DECREF(x);
5366 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 }
5369 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5371 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 Py_XDECREF(errorHandler);
5373 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005375
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 Py_XDECREF(errorHandler);
5378 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 Py_XDECREF(v);
5380 return NULL;
5381}
5382
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005383/* Charmap encoding: the lookup table */
5384
Alexander Belopolsky40018472011-02-26 01:02:56 +00005385struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 PyObject_HEAD
5387 unsigned char level1[32];
5388 int count2, count3;
5389 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005390};
5391
5392static PyObject*
5393encoding_map_size(PyObject *obj, PyObject* args)
5394{
5395 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005396 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005398}
5399
5400static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005401 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 PyDoc_STR("Return the size (in bytes) of this object") },
5403 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005404};
5405
5406static void
5407encoding_map_dealloc(PyObject* o)
5408{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005409 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005410}
5411
/* Type object for the charmap encoding trie (struct encoding_map).
   Only tp_dealloc and tp_methods are filled in besides the name, basic
   size and default flags; every other slot is left at 0.  The
   initializer is positional, so the order of entries must match the
   PyTypeObject layout. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
5455
5456PyObject*
5457PyUnicode_BuildEncodingMap(PyObject* string)
5458{
5459 Py_UNICODE *decode;
5460 PyObject *result;
5461 struct encoding_map *mresult;
5462 int i;
5463 int need_dict = 0;
5464 unsigned char level1[32];
5465 unsigned char level2[512];
5466 unsigned char *mlevel1, *mlevel2, *mlevel3;
5467 int count2 = 0, count3 = 0;
5468
5469 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5470 PyErr_BadArgument();
5471 return NULL;
5472 }
5473 decode = PyUnicode_AS_UNICODE(string);
5474 memset(level1, 0xFF, sizeof level1);
5475 memset(level2, 0xFF, sizeof level2);
5476
5477 /* If there isn't a one-to-one mapping of NULL to \0,
5478 or if there are non-BMP characters, we need to use
5479 a mapping dictionary. */
5480 if (decode[0] != 0)
5481 need_dict = 1;
5482 for (i = 1; i < 256; i++) {
5483 int l1, l2;
5484 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005485#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005486 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005487#endif
5488 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005489 need_dict = 1;
5490 break;
5491 }
5492 if (decode[i] == 0xFFFE)
5493 /* unmapped character */
5494 continue;
5495 l1 = decode[i] >> 11;
5496 l2 = decode[i] >> 7;
5497 if (level1[l1] == 0xFF)
5498 level1[l1] = count2++;
5499 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005500 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005501 }
5502
5503 if (count2 >= 0xFF || count3 >= 0xFF)
5504 need_dict = 1;
5505
5506 if (need_dict) {
5507 PyObject *result = PyDict_New();
5508 PyObject *key, *value;
5509 if (!result)
5510 return NULL;
5511 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005512 key = PyLong_FromLong(decode[i]);
5513 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005514 if (!key || !value)
5515 goto failed1;
5516 if (PyDict_SetItem(result, key, value) == -1)
5517 goto failed1;
5518 Py_DECREF(key);
5519 Py_DECREF(value);
5520 }
5521 return result;
5522 failed1:
5523 Py_XDECREF(key);
5524 Py_XDECREF(value);
5525 Py_DECREF(result);
5526 return NULL;
5527 }
5528
5529 /* Create a three-level trie */
5530 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5531 16*count2 + 128*count3 - 1);
5532 if (!result)
5533 return PyErr_NoMemory();
5534 PyObject_Init(result, &EncodingMapType);
5535 mresult = (struct encoding_map*)result;
5536 mresult->count2 = count2;
5537 mresult->count3 = count3;
5538 mlevel1 = mresult->level1;
5539 mlevel2 = mresult->level23;
5540 mlevel3 = mresult->level23 + 16*count2;
5541 memcpy(mlevel1, level1, 32);
5542 memset(mlevel2, 0xFF, 16*count2);
5543 memset(mlevel3, 0, 128*count3);
5544 count3 = 0;
5545 for (i = 1; i < 256; i++) {
5546 int o1, o2, o3, i2, i3;
5547 if (decode[i] == 0xFFFE)
5548 /* unmapped character */
5549 continue;
5550 o1 = decode[i]>>11;
5551 o2 = (decode[i]>>7) & 0xF;
5552 i2 = 16*mlevel1[o1] + o2;
5553 if (mlevel2[i2] == 0xFF)
5554 mlevel2[i2] = count3++;
5555 o3 = decode[i] & 0x7F;
5556 i3 = 128*mlevel2[i2] + o3;
5557 mlevel3[i3] = i;
5558 }
5559 return result;
5560}
5561
5562static int
5563encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5564{
5565 struct encoding_map *map = (struct encoding_map*)mapping;
5566 int l1 = c>>11;
5567 int l2 = (c>>7) & 0xF;
5568 int l3 = c & 0x7F;
5569 int i;
5570
5571#ifdef Py_UNICODE_WIDE
5572 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005574 }
5575#endif
5576 if (c == 0)
5577 return 0;
5578 /* level 1*/
5579 i = map->level1[l1];
5580 if (i == 0xFF) {
5581 return -1;
5582 }
5583 /* level 2*/
5584 i = map->level23[16*i+l2];
5585 if (i == 0xFF) {
5586 return -1;
5587 }
5588 /* level 3 */
5589 i = map->level23[16*map->count2 + 128*i + l3];
5590 if (i == 0) {
5591 return -1;
5592 }
5593 return i;
5594}
5595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596/* Lookup the character ch in the mapping. If the character
5597 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005598 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005599static PyObject *
5600charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601{
Christian Heimes217cfd12007-12-02 14:31:20 +00005602 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 PyObject *x;
5604
5605 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 x = PyObject_GetItem(mapping, w);
5608 Py_DECREF(w);
5609 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5611 /* No mapping found means: mapping is undefined. */
5612 PyErr_Clear();
5613 x = Py_None;
5614 Py_INCREF(x);
5615 return x;
5616 } else
5617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005619 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005621 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 long value = PyLong_AS_LONG(x);
5623 if (value < 0 || value > 255) {
5624 PyErr_SetString(PyExc_TypeError,
5625 "character mapping must be in range(256)");
5626 Py_DECREF(x);
5627 return NULL;
5628 }
5629 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005631 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 /* wrong return value */
5635 PyErr_Format(PyExc_TypeError,
5636 "character mapping must return integer, bytes or None, not %.400s",
5637 x->ob_type->tp_name);
5638 Py_DECREF(x);
5639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 }
5641}
5642
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005643static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005644charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005645{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005646 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5647 /* exponentially overallocate to minimize reallocations */
5648 if (requiredsize < 2*outsize)
5649 requiredsize = 2*outsize;
5650 if (_PyBytes_Resize(outobj, requiredsize))
5651 return -1;
5652 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005653}
5654
/* Status codes returned by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005659 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005660 space is available. Return a new reference to the object that
5661 was put in the output buffer, or Py_None, if the mapping was undefined
5662 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005663 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005664static charmapencode_result
5665charmapencode_output(Py_UNICODE c, PyObject *mapping,
5666 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005668 PyObject *rep;
5669 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005670 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671
Christian Heimes90aa7642007-12-19 02:45:37 +00005672 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005673 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005675 if (res == -1)
5676 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 if (outsize<requiredsize)
5678 if (charmapencode_resize(outobj, outpos, requiredsize))
5679 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005680 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 outstart[(*outpos)++] = (char)res;
5682 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005683 }
5684
5685 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005688 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 Py_DECREF(rep);
5690 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005691 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 if (PyLong_Check(rep)) {
5693 Py_ssize_t requiredsize = *outpos+1;
5694 if (outsize<requiredsize)
5695 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5696 Py_DECREF(rep);
5697 return enc_EXCEPTION;
5698 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005699 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005701 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 else {
5703 const char *repchars = PyBytes_AS_STRING(rep);
5704 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5705 Py_ssize_t requiredsize = *outpos+repsize;
5706 if (outsize<requiredsize)
5707 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5708 Py_DECREF(rep);
5709 return enc_EXCEPTION;
5710 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005711 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 memcpy(outstart + *outpos, repchars, repsize);
5713 *outpos += repsize;
5714 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005716 Py_DECREF(rep);
5717 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718}
5719
5720/* handle an error in PyUnicode_EncodeCharmap
5721 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005722static int
5723charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005724 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005726 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005727 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728{
5729 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005730 Py_ssize_t repsize;
5731 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 Py_UNICODE *uni2;
5733 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005734 Py_ssize_t collstartpos = *inpos;
5735 Py_ssize_t collendpos = *inpos+1;
5736 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 char *encoding = "charmap";
5738 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005739 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 /* find all unencodable characters */
5742 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005743 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005744 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 int res = encoding_map_lookup(p[collendpos], mapping);
5746 if (res != -1)
5747 break;
5748 ++collendpos;
5749 continue;
5750 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005751
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 rep = charmapencode_lookup(p[collendpos], mapping);
5753 if (rep==NULL)
5754 return -1;
5755 else if (rep!=Py_None) {
5756 Py_DECREF(rep);
5757 break;
5758 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005759 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 }
5762 /* cache callback name lookup
5763 * (if not done yet, i.e. it's the first error) */
5764 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 if ((errors==NULL) || (!strcmp(errors, "strict")))
5766 *known_errorHandler = 1;
5767 else if (!strcmp(errors, "replace"))
5768 *known_errorHandler = 2;
5769 else if (!strcmp(errors, "ignore"))
5770 *known_errorHandler = 3;
5771 else if (!strcmp(errors, "xmlcharrefreplace"))
5772 *known_errorHandler = 4;
5773 else
5774 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 }
5776 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005777 case 1: /* strict */
5778 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5779 return -1;
5780 case 2: /* replace */
5781 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 x = charmapencode_output('?', mapping, res, respos);
5783 if (x==enc_EXCEPTION) {
5784 return -1;
5785 }
5786 else if (x==enc_FAILED) {
5787 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5788 return -1;
5789 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005790 }
5791 /* fall through */
5792 case 3: /* ignore */
5793 *inpos = collendpos;
5794 break;
5795 case 4: /* xmlcharrefreplace */
5796 /* generate replacement (temporarily (mis)uses p) */
5797 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 char buffer[2+29+1+1];
5799 char *cp;
5800 sprintf(buffer, "&#%d;", (int)p[collpos]);
5801 for (cp = buffer; *cp; ++cp) {
5802 x = charmapencode_output(*cp, mapping, res, respos);
5803 if (x==enc_EXCEPTION)
5804 return -1;
5805 else if (x==enc_FAILED) {
5806 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5807 return -1;
5808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005809 }
5810 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005811 *inpos = collendpos;
5812 break;
5813 default:
5814 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 encoding, reason, p, size, exceptionObject,
5816 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005817 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005819 if (PyBytes_Check(repunicode)) {
5820 /* Directly copy bytes result to output. */
5821 Py_ssize_t outsize = PyBytes_Size(*res);
5822 Py_ssize_t requiredsize;
5823 repsize = PyBytes_Size(repunicode);
5824 requiredsize = *respos + repsize;
5825 if (requiredsize > outsize)
5826 /* Make room for all additional bytes. */
5827 if (charmapencode_resize(res, respos, requiredsize)) {
5828 Py_DECREF(repunicode);
5829 return -1;
5830 }
5831 memcpy(PyBytes_AsString(*res) + *respos,
5832 PyBytes_AsString(repunicode), repsize);
5833 *respos += repsize;
5834 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005835 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005836 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005838 /* generate replacement */
5839 repsize = PyUnicode_GET_SIZE(repunicode);
5840 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 x = charmapencode_output(*uni2, mapping, res, respos);
5842 if (x==enc_EXCEPTION) {
5843 return -1;
5844 }
5845 else if (x==enc_FAILED) {
5846 Py_DECREF(repunicode);
5847 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5848 return -1;
5849 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005850 }
5851 *inpos = newpos;
5852 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 }
5854 return 0;
5855}
5856
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyObject *
5858PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5859 Py_ssize_t size,
5860 PyObject *mapping,
5861 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 /* output object */
5864 PyObject *res = NULL;
5865 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 PyObject *errorHandler = NULL;
5870 PyObject *exc = NULL;
5871 /* the following variable is used for caching string comparisons
5872 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5873 * 3=ignore, 4=xmlcharrefreplace */
5874 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
5876 /* Default to Latin-1 */
5877 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 /* allocate enough for a simple encoding without
5881 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005882 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 if (res == NULL)
5884 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005885 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005888 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 /* try to encode it */
5890 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5891 if (x==enc_EXCEPTION) /* error */
5892 goto onError;
5893 if (x==enc_FAILED) { /* unencodable character */
5894 if (charmap_encoding_error(p, size, &inpos, mapping,
5895 &exc,
5896 &known_errorHandler, &errorHandler, errors,
5897 &res, &respos)) {
5898 goto onError;
5899 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005900 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 else
5902 /* done with this character => adjust input position */
5903 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005907 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005908 if (_PyBytes_Resize(&res, respos) < 0)
5909 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005910
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 Py_XDECREF(exc);
5912 Py_XDECREF(errorHandler);
5913 return res;
5914
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 Py_XDECREF(res);
5917 Py_XDECREF(exc);
5918 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 return NULL;
5920}
5921
Alexander Belopolsky40018472011-02-26 01:02:56 +00005922PyObject *
5923PyUnicode_AsCharmapString(PyObject *unicode,
5924 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925{
5926 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 PyErr_BadArgument();
5928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 }
5930 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 PyUnicode_GET_SIZE(unicode),
5932 mapping,
5933 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934}
5935
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005937static void
5938make_translate_exception(PyObject **exceptionObject,
5939 const Py_UNICODE *unicode, Py_ssize_t size,
5940 Py_ssize_t startpos, Py_ssize_t endpos,
5941 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005943 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005944 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 }
5947 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5949 goto onError;
5950 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5951 goto onError;
5952 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5953 goto onError;
5954 return;
5955 onError:
5956 Py_DECREF(*exceptionObject);
5957 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 }
5959}
5960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005962static void
5963raise_translate_exception(PyObject **exceptionObject,
5964 const Py_UNICODE *unicode, Py_ssize_t size,
5965 Py_ssize_t startpos, Py_ssize_t endpos,
5966 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967{
5968 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972}
5973
5974/* error handling callback helper:
5975 build arguments, call the callback and check the arguments,
5976 put the result into newpos and return the replacement string, which
5977 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005978static PyObject *
5979unicode_translate_call_errorhandler(const char *errors,
5980 PyObject **errorHandler,
5981 const char *reason,
5982 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5983 Py_ssize_t startpos, Py_ssize_t endpos,
5984 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005985{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005986 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005988 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 PyObject *restuple;
5990 PyObject *resunicode;
5991
5992 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 }
5997
5998 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002
6003 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006005 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00006008 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 Py_DECREF(restuple);
6010 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 }
6012 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 &resunicode, &i_newpos)) {
6014 Py_DECREF(restuple);
6015 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006017 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006019 else
6020 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006021 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6023 Py_DECREF(restuple);
6024 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006025 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 Py_INCREF(resunicode);
6027 Py_DECREF(restuple);
6028 return resunicode;
6029}
6030
6031/* Lookup the character ch in the mapping and put the result in result,
6032 which must be decrefed by the caller.
6033 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006034static int
6035charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036{
Christian Heimes217cfd12007-12-02 14:31:20 +00006037 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038 PyObject *x;
6039
6040 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 x = PyObject_GetItem(mapping, w);
6043 Py_DECREF(w);
6044 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6046 /* No mapping found means: use 1:1 mapping. */
6047 PyErr_Clear();
6048 *result = NULL;
6049 return 0;
6050 } else
6051 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052 }
6053 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 *result = x;
6055 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006057 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 long value = PyLong_AS_LONG(x);
6059 long max = PyUnicode_GetMax();
6060 if (value < 0 || value > max) {
6061 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006062 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 Py_DECREF(x);
6064 return -1;
6065 }
6066 *result = x;
6067 return 0;
6068 }
6069 else if (PyUnicode_Check(x)) {
6070 *result = x;
6071 return 0;
6072 }
6073 else {
6074 /* wrong return value */
6075 PyErr_SetString(PyExc_TypeError,
6076 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006077 Py_DECREF(x);
6078 return -1;
6079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080}
6081/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 if not reallocate and adjust various state variables.
6083 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006084static int
6085charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006088 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006089 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 /* remember old output position */
6091 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6092 /* exponentially overallocate to minimize reallocations */
6093 if (requiredsize < 2 * oldsize)
6094 requiredsize = 2 * oldsize;
6095 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6096 return -1;
6097 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098 }
6099 return 0;
6100}
6101/* lookup the character, put the result in the output string and adjust
6102 various state variables. Return a new reference to the object that
6103 was put in the output buffer in *result, or Py_None, if the mapping was
6104 undefined (in which case no character was written).
6105 The called must decref result.
6106 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006107static int
6108charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6109 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6110 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111{
Walter Dörwald4894c302003-10-24 14:25:28 +00006112 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 /* not found => default to 1:1 mapping */
6116 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006117 }
6118 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006120 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 /* no overflow check, because we know that the space is enough */
6122 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 }
6124 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6126 if (repsize==1) {
6127 /* no overflow check, because we know that the space is enough */
6128 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6129 }
6130 else if (repsize!=0) {
6131 /* more than one character */
6132 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6133 (insize - (curinp-startinp)) +
6134 repsize - 1;
6135 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6136 return -1;
6137 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6138 *outp += repsize;
6139 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 }
6141 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 return 0;
6144}
6145
Alexander Belopolsky40018472011-02-26 01:02:56 +00006146PyObject *
6147PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6148 Py_ssize_t size,
6149 PyObject *mapping,
6150 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152 /* output object */
6153 PyObject *res = NULL;
6154 /* pointers to the beginning and end+1 of input */
6155 const Py_UNICODE *startp = p;
6156 const Py_UNICODE *endp = p + size;
6157 /* pointer into the output */
6158 Py_UNICODE *str;
6159 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006160 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161 char *reason = "character maps to <undefined>";
6162 PyObject *errorHandler = NULL;
6163 PyObject *exc = NULL;
6164 /* the following variable is used for caching string comparisons
6165 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6166 * 3=ignore, 4=xmlcharrefreplace */
6167 int known_errorHandler = -1;
6168
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 PyErr_BadArgument();
6171 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173
6174 /* allocate enough for a simple 1:1 translation without
6175 replacements, if we need more, we'll resize */
6176 res = PyUnicode_FromUnicode(NULL, size);
6177 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006183 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 /* try to encode it */
6185 PyObject *x = NULL;
6186 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6187 Py_XDECREF(x);
6188 goto onError;
6189 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006190 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 if (x!=Py_None) /* it worked => adjust input pointer */
6192 ++p;
6193 else { /* untranslatable character */
6194 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6195 Py_ssize_t repsize;
6196 Py_ssize_t newpos;
6197 Py_UNICODE *uni2;
6198 /* startpos for collecting untranslatable chars */
6199 const Py_UNICODE *collstart = p;
6200 const Py_UNICODE *collend = p+1;
6201 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* find all untranslatable characters */
6204 while (collend < endp) {
6205 if (charmaptranslate_lookup(*collend, mapping, &x))
6206 goto onError;
6207 Py_XDECREF(x);
6208 if (x!=Py_None)
6209 break;
6210 ++collend;
6211 }
6212 /* cache callback name lookup
6213 * (if not done yet, i.e. it's the first error) */
6214 if (known_errorHandler==-1) {
6215 if ((errors==NULL) || (!strcmp(errors, "strict")))
6216 known_errorHandler = 1;
6217 else if (!strcmp(errors, "replace"))
6218 known_errorHandler = 2;
6219 else if (!strcmp(errors, "ignore"))
6220 known_errorHandler = 3;
6221 else if (!strcmp(errors, "xmlcharrefreplace"))
6222 known_errorHandler = 4;
6223 else
6224 known_errorHandler = 0;
6225 }
6226 switch (known_errorHandler) {
6227 case 1: /* strict */
6228 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006229 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 case 2: /* replace */
6231 /* No need to check for space, this is a 1:1 replacement */
6232 for (coll = collstart; coll<collend; ++coll)
6233 *str++ = '?';
6234 /* fall through */
6235 case 3: /* ignore */
6236 p = collend;
6237 break;
6238 case 4: /* xmlcharrefreplace */
6239 /* generate replacement (temporarily (mis)uses p) */
6240 for (p = collstart; p < collend; ++p) {
6241 char buffer[2+29+1+1];
6242 char *cp;
6243 sprintf(buffer, "&#%d;", (int)*p);
6244 if (charmaptranslate_makespace(&res, &str,
6245 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6246 goto onError;
6247 for (cp = buffer; *cp; ++cp)
6248 *str++ = *cp;
6249 }
6250 p = collend;
6251 break;
6252 default:
6253 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6254 reason, startp, size, &exc,
6255 collstart-startp, collend-startp, &newpos);
6256 if (repunicode == NULL)
6257 goto onError;
6258 /* generate replacement */
6259 repsize = PyUnicode_GET_SIZE(repunicode);
6260 if (charmaptranslate_makespace(&res, &str,
6261 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6262 Py_DECREF(repunicode);
6263 goto onError;
6264 }
6265 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6266 *str++ = *uni2;
6267 p = startp + newpos;
6268 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006270 }
6271 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 /* Resize if we allocated to much */
6273 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006274 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 if (PyUnicode_Resize(&res, respos) < 0)
6276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006277 }
6278 Py_XDECREF(exc);
6279 Py_XDECREF(errorHandler);
6280 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 Py_XDECREF(res);
6284 Py_XDECREF(exc);
6285 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 return NULL;
6287}
6288
Alexander Belopolsky40018472011-02-26 01:02:56 +00006289PyObject *
6290PyUnicode_Translate(PyObject *str,
6291 PyObject *mapping,
6292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293{
6294 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006295
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 str = PyUnicode_FromObject(str);
6297 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 PyUnicode_GET_SIZE(str),
6301 mapping,
6302 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 Py_DECREF(str);
6304 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006305
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 Py_XDECREF(str);
6308 return NULL;
6309}
Tim Petersced69f82003-09-16 20:30:58 +00006310
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006311PyObject *
6312PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6313 Py_ssize_t length)
6314{
6315 PyObject *result;
6316 Py_UNICODE *p; /* write pointer into result */
6317 Py_ssize_t i;
6318 /* Copy to a new string */
6319 result = (PyObject *)_PyUnicode_New(length);
6320 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6321 if (result == NULL)
6322 return result;
6323 p = PyUnicode_AS_UNICODE(result);
6324 /* Iterate over code points */
6325 for (i = 0; i < length; i++) {
6326 Py_UNICODE ch =s[i];
6327 if (ch > 127) {
6328 int decimal = Py_UNICODE_TODECIMAL(ch);
6329 if (decimal >= 0)
6330 p[i] = '0' + decimal;
6331 }
6332 }
6333 return result;
6334}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006335/* --- Decimal Encoder ---------------------------------------------------- */
6336
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337int
6338PyUnicode_EncodeDecimal(Py_UNICODE *s,
6339 Py_ssize_t length,
6340 char *output,
6341 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006342{
6343 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344 PyObject *errorHandler = NULL;
6345 PyObject *exc = NULL;
6346 const char *encoding = "decimal";
6347 const char *reason = "invalid decimal Unicode string";
6348 /* the following variable is used for caching string comparisons
6349 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6350 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006351
6352 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 PyErr_BadArgument();
6354 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006355 }
6356
6357 p = s;
6358 end = s + length;
6359 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 register Py_UNICODE ch = *p;
6361 int decimal;
6362 PyObject *repunicode;
6363 Py_ssize_t repsize;
6364 Py_ssize_t newpos;
6365 Py_UNICODE *uni2;
6366 Py_UNICODE *collstart;
6367 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006368
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006370 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 ++p;
6372 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006373 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 decimal = Py_UNICODE_TODECIMAL(ch);
6375 if (decimal >= 0) {
6376 *output++ = '0' + decimal;
6377 ++p;
6378 continue;
6379 }
6380 if (0 < ch && ch < 256) {
6381 *output++ = (char)ch;
6382 ++p;
6383 continue;
6384 }
6385 /* All other characters are considered unencodable */
6386 collstart = p;
6387 collend = p+1;
6388 while (collend < end) {
6389 if ((0 < *collend && *collend < 256) ||
6390 !Py_UNICODE_ISSPACE(*collend) ||
6391 Py_UNICODE_TODECIMAL(*collend))
6392 break;
6393 }
6394 /* cache callback name lookup
6395 * (if not done yet, i.e. it's the first error) */
6396 if (known_errorHandler==-1) {
6397 if ((errors==NULL) || (!strcmp(errors, "strict")))
6398 known_errorHandler = 1;
6399 else if (!strcmp(errors, "replace"))
6400 known_errorHandler = 2;
6401 else if (!strcmp(errors, "ignore"))
6402 known_errorHandler = 3;
6403 else if (!strcmp(errors, "xmlcharrefreplace"))
6404 known_errorHandler = 4;
6405 else
6406 known_errorHandler = 0;
6407 }
6408 switch (known_errorHandler) {
6409 case 1: /* strict */
6410 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6411 goto onError;
6412 case 2: /* replace */
6413 for (p = collstart; p < collend; ++p)
6414 *output++ = '?';
6415 /* fall through */
6416 case 3: /* ignore */
6417 p = collend;
6418 break;
6419 case 4: /* xmlcharrefreplace */
6420 /* generate replacement (temporarily (mis)uses p) */
6421 for (p = collstart; p < collend; ++p)
6422 output += sprintf(output, "&#%d;", (int)*p);
6423 p = collend;
6424 break;
6425 default:
6426 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6427 encoding, reason, s, length, &exc,
6428 collstart-s, collend-s, &newpos);
6429 if (repunicode == NULL)
6430 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006431 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006432 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006433 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6434 Py_DECREF(repunicode);
6435 goto onError;
6436 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 /* generate replacement */
6438 repsize = PyUnicode_GET_SIZE(repunicode);
6439 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6440 Py_UNICODE ch = *uni2;
6441 if (Py_UNICODE_ISSPACE(ch))
6442 *output++ = ' ';
6443 else {
6444 decimal = Py_UNICODE_TODECIMAL(ch);
6445 if (decimal >= 0)
6446 *output++ = '0' + decimal;
6447 else if (0 < ch && ch < 256)
6448 *output++ = (char)ch;
6449 else {
6450 Py_DECREF(repunicode);
6451 raise_encode_exception(&exc, encoding,
6452 s, length, collstart-s, collend-s, reason);
6453 goto onError;
6454 }
6455 }
6456 }
6457 p = s + newpos;
6458 Py_DECREF(repunicode);
6459 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006460 }
6461 /* 0-terminate the output string */
6462 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 Py_XDECREF(exc);
6464 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006465 return 0;
6466
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468 Py_XDECREF(exc);
6469 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006470 return -1;
6471}
6472
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473/* --- Helpers ------------------------------------------------------------ */
6474
Eric Smith8c663262007-08-25 02:26:07 +00006475#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006476#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006477
Thomas Wouters477c8d52006-05-27 19:21:47 +00006478#include "stringlib/count.h"
6479#include "stringlib/find.h"
6480#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006481#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006482
Eric Smith5807c412008-05-11 21:00:57 +00006483#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006484#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006485#include "stringlib/localeutil.h"
6486
/* helper macro to fixup start/end slice values:
   `end` is clamped to `len` when too large; a negative `end` or `start`
   is taken relative to `len` and clamped to 0 if still negative.
   `start` and `end` must be modifiable lvalues.  Every argument is
   parenthesized so that expression arguments (e.g. a conditional
   expression for `len`) are evaluated as a unit.
   The expansion is a plain statement sequence (not a single statement),
   so do not use it as the unbraced body of an if/else or loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006501
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502Py_ssize_t
6503PyUnicode_Count(PyObject *str,
6504 PyObject *substr,
6505 Py_ssize_t start,
6506 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006508 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006509 PyUnicodeObject* str_obj;
6510 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006511
Thomas Wouters477c8d52006-05-27 19:21:47 +00006512 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6513 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006515 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6516 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 Py_DECREF(str_obj);
6518 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 }
Tim Petersced69f82003-09-16 20:30:58 +00006520
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006521 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006522 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006523 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6524 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006525 );
6526
6527 Py_DECREF(sub_obj);
6528 Py_DECREF(str_obj);
6529
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 return result;
6531}
6532
Alexander Belopolsky40018472011-02-26 01:02:56 +00006533Py_ssize_t
6534PyUnicode_Find(PyObject *str,
6535 PyObject *sub,
6536 Py_ssize_t start,
6537 Py_ssize_t end,
6538 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006540 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006541
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006543 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006545 sub = PyUnicode_FromObject(sub);
6546 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 Py_DECREF(str);
6548 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 }
Tim Petersced69f82003-09-16 20:30:58 +00006550
Thomas Wouters477c8d52006-05-27 19:21:47 +00006551 if (direction > 0)
6552 result = stringlib_find_slice(
6553 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6554 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6555 start, end
6556 );
6557 else
6558 result = stringlib_rfind_slice(
6559 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6560 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6561 start, end
6562 );
6563
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006565 Py_DECREF(sub);
6566
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 return result;
6568}
6569
Alexander Belopolsky40018472011-02-26 01:02:56 +00006570static int
6571tailmatch(PyUnicodeObject *self,
6572 PyUnicodeObject *substring,
6573 Py_ssize_t start,
6574 Py_ssize_t end,
6575 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 if (substring->length == 0)
6578 return 1;
6579
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006580 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 end -= substring->length;
6582 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
6585 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 if (Py_UNICODE_MATCH(self, end, substring))
6587 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 } else {
6589 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 }
6592
6593 return 0;
6594}
6595
Alexander Belopolsky40018472011-02-26 01:02:56 +00006596Py_ssize_t
6597PyUnicode_Tailmatch(PyObject *str,
6598 PyObject *substr,
6599 Py_ssize_t start,
6600 Py_ssize_t end,
6601 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006603 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 str = PyUnicode_FromObject(str);
6606 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 substr = PyUnicode_FromObject(substr);
6609 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 Py_DECREF(str);
6611 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 }
Tim Petersced69f82003-09-16 20:30:58 +00006613
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 (PyUnicodeObject *)substr,
6616 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 Py_DECREF(str);
6618 Py_DECREF(substr);
6619 return result;
6620}
6621
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622/* Apply fixfct filter to the Unicode object self and return a
6623 reference to the modified object */
6624
Alexander Belopolsky40018472011-02-26 01:02:56 +00006625static PyObject *
6626fixup(PyUnicodeObject *self,
6627 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628{
6629
6630 PyUnicodeObject *u;
6631
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006632 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006635
6636 Py_UNICODE_COPY(u->str, self->str, self->length);
6637
Tim Peters7a29bd52001-09-12 03:03:31 +00006638 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 /* fixfct should return TRUE if it modified the buffer. If
6640 FALSE, return a reference to the original buffer instead
6641 (to save space, not time) */
6642 Py_INCREF(self);
6643 Py_DECREF(u);
6644 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 }
6646 return (PyObject*) u;
6647}
6648
Alexander Belopolsky40018472011-02-26 01:02:56 +00006649static int
6650fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006652 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 Py_UNICODE *s = self->str;
6654 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006655
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006658
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 ch = Py_UNICODE_TOUPPER(*s);
6660 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 *s = ch;
6663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 s++;
6665 }
6666
6667 return status;
6668}
6669
Alexander Belopolsky40018472011-02-26 01:02:56 +00006670static int
6671fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006673 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 Py_UNICODE *s = self->str;
6675 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006679
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 ch = Py_UNICODE_TOLOWER(*s);
6681 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 *s = ch;
6684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 s++;
6686 }
6687
6688 return status;
6689}
6690
Alexander Belopolsky40018472011-02-26 01:02:56 +00006691static int
6692fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 Py_UNICODE *s = self->str;
6696 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 while (len-- > 0) {
6699 if (Py_UNICODE_ISUPPER(*s)) {
6700 *s = Py_UNICODE_TOLOWER(*s);
6701 status = 1;
6702 } else if (Py_UNICODE_ISLOWER(*s)) {
6703 *s = Py_UNICODE_TOUPPER(*s);
6704 status = 1;
6705 }
6706 s++;
6707 }
6708
6709 return status;
6710}
6711
Alexander Belopolsky40018472011-02-26 01:02:56 +00006712static int
6713fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006715 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006716 Py_UNICODE *s = self->str;
6717 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006718
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006719 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006721 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 *s = Py_UNICODE_TOUPPER(*s);
6723 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006725 s++;
6726 while (--len > 0) {
6727 if (Py_UNICODE_ISUPPER(*s)) {
6728 *s = Py_UNICODE_TOLOWER(*s);
6729 status = 1;
6730 }
6731 s++;
6732 }
6733 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734}
6735
Alexander Belopolsky40018472011-02-26 01:02:56 +00006736static int
6737fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738{
6739 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6740 register Py_UNICODE *e;
6741 int previous_is_cased;
6742
6743 /* Shortcut for single character strings */
6744 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6746 if (*p != ch) {
6747 *p = ch;
6748 return 1;
6749 }
6750 else
6751 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 }
Tim Petersced69f82003-09-16 20:30:58 +00006753
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 e = p + PyUnicode_GET_SIZE(self);
6755 previous_is_cased = 0;
6756 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006758
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 if (previous_is_cased)
6760 *p = Py_UNICODE_TOLOWER(ch);
6761 else
6762 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006763
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 if (Py_UNICODE_ISLOWER(ch) ||
6765 Py_UNICODE_ISUPPER(ch) ||
6766 Py_UNICODE_ISTITLE(ch))
6767 previous_is_cased = 1;
6768 else
6769 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 }
6771 return 1;
6772}
6773
Tim Peters8ce9f162004-08-27 01:49:32 +00006774PyObject *
6775PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776{
Skip Montanaro6543b452004-09-16 03:28:13 +00006777 const Py_UNICODE blank = ' ';
6778 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006779 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006780 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006781 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6782 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006783 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6784 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006785 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006786 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787
Tim Peters05eba1f2004-08-27 21:32:02 +00006788 fseq = PySequence_Fast(seq, "");
6789 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006790 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006791 }
6792
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006793 /* NOTE: the following code can't call back into Python code,
6794 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006795 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006796
Tim Peters05eba1f2004-08-27 21:32:02 +00006797 seqlen = PySequence_Fast_GET_SIZE(fseq);
6798 /* If empty sequence, return u"". */
6799 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006800 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6801 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006802 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006803 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006804 /* If singleton sequence with an exact Unicode, return that. */
6805 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 item = items[0];
6807 if (PyUnicode_CheckExact(item)) {
6808 Py_INCREF(item);
6809 res = (PyUnicodeObject *)item;
6810 goto Done;
6811 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006812 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006813 else {
6814 /* Set up sep and seplen */
6815 if (separator == NULL) {
6816 sep = &blank;
6817 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006818 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006819 else {
6820 if (!PyUnicode_Check(separator)) {
6821 PyErr_Format(PyExc_TypeError,
6822 "separator: expected str instance,"
6823 " %.80s found",
6824 Py_TYPE(separator)->tp_name);
6825 goto onError;
6826 }
6827 sep = PyUnicode_AS_UNICODE(separator);
6828 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006829 }
6830 }
6831
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006832 /* There are at least two things to join, or else we have a subclass
6833 * of str in the sequence.
6834 * Do a pre-pass to figure out the total amount of space we'll
6835 * need (sz), and see whether all argument are strings.
6836 */
6837 sz = 0;
6838 for (i = 0; i < seqlen; i++) {
6839 const Py_ssize_t old_sz = sz;
6840 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 if (!PyUnicode_Check(item)) {
6842 PyErr_Format(PyExc_TypeError,
6843 "sequence item %zd: expected str instance,"
6844 " %.80s found",
6845 i, Py_TYPE(item)->tp_name);
6846 goto onError;
6847 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006848 sz += PyUnicode_GET_SIZE(item);
6849 if (i != 0)
6850 sz += seplen;
6851 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6852 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006854 goto onError;
6855 }
6856 }
Tim Petersced69f82003-09-16 20:30:58 +00006857
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006858 res = _PyUnicode_New(sz);
6859 if (res == NULL)
6860 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006861
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006862 /* Catenate everything. */
6863 res_p = PyUnicode_AS_UNICODE(res);
6864 for (i = 0; i < seqlen; ++i) {
6865 Py_ssize_t itemlen;
6866 item = items[i];
6867 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 /* Copy item, and maybe the separator. */
6869 if (i) {
6870 Py_UNICODE_COPY(res_p, sep, seplen);
6871 res_p += seplen;
6872 }
6873 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6874 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006875 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006876
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006878 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 return (PyObject *)res;
6880
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006882 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006883 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 return NULL;
6885}
6886
Alexander Belopolsky40018472011-02-26 01:02:56 +00006887static PyUnicodeObject *
6888pad(PyUnicodeObject *self,
6889 Py_ssize_t left,
6890 Py_ssize_t right,
6891 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892{
6893 PyUnicodeObject *u;
6894
6895 if (left < 0)
6896 left = 0;
6897 if (right < 0)
6898 right = 0;
6899
Tim Peters7a29bd52001-09-12 03:03:31 +00006900 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 Py_INCREF(self);
6902 return self;
6903 }
6904
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006905 if (left > PY_SSIZE_T_MAX - self->length ||
6906 right > PY_SSIZE_T_MAX - (left + self->length)) {
6907 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6908 return NULL;
6909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 u = _PyUnicode_New(left + self->length + right);
6911 if (u) {
6912 if (left)
6913 Py_UNICODE_FILL(u->str, fill, left);
6914 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6915 if (right)
6916 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6917 }
6918
6919 return u;
6920}
6921
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyObject *
6923PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
6927 string = PyUnicode_FromObject(string);
6928 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006931 list = stringlib_splitlines(
6932 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6933 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
6935 Py_DECREF(string);
6936 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937}
6938
Alexander Belopolsky40018472011-02-26 01:02:56 +00006939static PyObject *
6940split(PyUnicodeObject *self,
6941 PyUnicodeObject *substring,
6942 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006945 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006948 return stringlib_split_whitespace(
6949 (PyObject*) self, self->str, self->length, maxcount
6950 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006952 return stringlib_split(
6953 (PyObject*) self, self->str, self->length,
6954 substring->str, substring->length,
6955 maxcount
6956 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957}
6958
Alexander Belopolsky40018472011-02-26 01:02:56 +00006959static PyObject *
6960rsplit(PyUnicodeObject *self,
6961 PyUnicodeObject *substring,
6962 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006963{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006964 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006965 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006966
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006967 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006968 return stringlib_rsplit_whitespace(
6969 (PyObject*) self, self->str, self->length, maxcount
6970 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006971
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006972 return stringlib_rsplit(
6973 (PyObject*) self, self->str, self->length,
6974 substring->str, substring->length,
6975 maxcount
6976 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006977}
6978
Alexander Belopolsky40018472011-02-26 01:02:56 +00006979static PyObject *
6980replace(PyUnicodeObject *self,
6981 PyUnicodeObject *str1,
6982 PyUnicodeObject *str2,
6983 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984{
6985 PyUnicodeObject *u;
6986
6987 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006989 else if (maxcount == 0 || self->length == 0)
6990 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
Thomas Wouters477c8d52006-05-27 19:21:47 +00006992 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006993 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006994 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006995 if (str1->length == 0)
6996 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006997 if (str1->length == 1) {
6998 /* replace characters */
6999 Py_UNICODE u1, u2;
7000 if (!findchar(self->str, self->length, str1->str[0]))
7001 goto nothing;
7002 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7003 if (!u)
7004 return NULL;
7005 Py_UNICODE_COPY(u->str, self->str, self->length);
7006 u1 = str1->str[0];
7007 u2 = str2->str[0];
7008 for (i = 0; i < u->length; i++)
7009 if (u->str[i] == u1) {
7010 if (--maxcount < 0)
7011 break;
7012 u->str[i] = u2;
7013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007015 i = stringlib_find(
7016 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007018 if (i < 0)
7019 goto nothing;
7020 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7021 if (!u)
7022 return NULL;
7023 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007024
7025 /* change everything in-place, starting with this one */
7026 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7027 i += str1->length;
7028
7029 while ( --maxcount > 0) {
7030 i = stringlib_find(self->str+i, self->length-i,
7031 str1->str, str1->length,
7032 i);
7033 if (i == -1)
7034 break;
7035 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7036 i += str1->length;
7037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007040
Brett Cannonb94767f2011-02-22 20:15:44 +00007041 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007042 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 Py_UNICODE *p;
7044
7045 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007046 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7047 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007048 if (n == 0)
7049 goto nothing;
7050 /* new_size = self->length + n * (str2->length - str1->length)); */
7051 delta = (str2->length - str1->length);
7052 if (delta == 0) {
7053 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007055 product = n * (str2->length - str1->length);
7056 if ((product / (str2->length - str1->length)) != n) {
7057 PyErr_SetString(PyExc_OverflowError,
7058 "replace string is too long");
7059 return NULL;
7060 }
7061 new_size = self->length + product;
7062 if (new_size < 0) {
7063 PyErr_SetString(PyExc_OverflowError,
7064 "replace string is too long");
7065 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 }
7067 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007068 u = _PyUnicode_New(new_size);
7069 if (!u)
7070 return NULL;
7071 i = 0;
7072 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007073 if (str1->length > 0) {
7074 while (n-- > 0) {
7075 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007076 j = stringlib_find(self->str+i, self->length-i,
7077 str1->str, str1->length,
7078 i);
7079 if (j == -1)
7080 break;
7081 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007082 /* copy unchanged part [i:j] */
7083 Py_UNICODE_COPY(p, self->str+i, j-i);
7084 p += j - i;
7085 }
7086 /* copy substitution string */
7087 if (str2->length > 0) {
7088 Py_UNICODE_COPY(p, str2->str, str2->length);
7089 p += str2->length;
7090 }
7091 i = j + str1->length;
7092 }
7093 if (i < self->length)
7094 /* copy tail [i:] */
7095 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7096 } else {
7097 /* interleave */
7098 while (n > 0) {
7099 Py_UNICODE_COPY(p, str2->str, str2->length);
7100 p += str2->length;
7101 if (--n <= 0)
7102 break;
7103 *p++ = self->str[i++];
7104 }
7105 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007109
Benjamin Peterson29060642009-01-31 22:14:21 +00007110 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007111 /* nothing to replace; return original string (when possible) */
7112 if (PyUnicode_CheckExact(self)) {
7113 Py_INCREF(self);
7114 return (PyObject *) self;
7115 }
7116 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117}
7118
7119/* --- Unicode Object Methods --------------------------------------------- */
7120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007121PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123\n\
7124Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007125characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126
7127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007128unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 return fixup(self, fixtitle);
7131}
7132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007133PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135\n\
7136Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007137have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
7139static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007140unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 return fixup(self, fixcapitalize);
7143}
7144
#if 0
/* Disabled: this method is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;    /* stays NULL if a word fails to convert */
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capped = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                                 fixcapitalize);
        if (capped == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return joined;
}
#endif
7182
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007183/* Argument converter. Coerces to a single unicode character */
7184
7185static int
7186convert_uc(PyObject *obj, void *addr)
7187{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007188 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7189 PyObject *uniobj;
7190 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007191
Benjamin Peterson14339b62009-01-31 16:36:08 +00007192 uniobj = PyUnicode_FromObject(obj);
7193 if (uniobj == NULL) {
7194 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007196 return 0;
7197 }
7198 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7199 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007201 Py_DECREF(uniobj);
7202 return 0;
7203 }
7204 unistr = PyUnicode_AS_UNICODE(uniobj);
7205 *fillcharloc = unistr[0];
7206 Py_DECREF(uniobj);
7207 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007208}
7209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007213Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007214done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
7216static PyObject *
7217unicode_center(PyUnicodeObject *self, PyObject *args)
7218{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007219 Py_ssize_t marg, left;
7220 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007221 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222
Thomas Woutersde017742006-02-16 19:34:37 +00007223 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 return NULL;
7225
Tim Peters7a29bd52001-09-12 03:03:31 +00007226 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 Py_INCREF(self);
7228 return (PyObject*) self;
7229 }
7230
7231 marg = width - self->length;
7232 left = marg / 2 + (marg & width & 1);
7233
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007234 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235}
7236
Marc-André Lemburge5034372000-08-08 08:04:29 +00007237#if 0
7238
7239/* This code should go into some future Unicode collation support
7240 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007241 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007242
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007243/* speedy UTF-16 code point order comparison */
7244/* gleaned from: */
7245/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7246
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007247static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007248{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007249 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007250 0, 0, 0, 0, 0, 0, 0, 0,
7251 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007252 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007253};
7254
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255static int
7256unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7257{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007258 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007259
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 Py_UNICODE *s1 = str1->str;
7261 Py_UNICODE *s2 = str2->str;
7262
7263 len1 = str1->length;
7264 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007265
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007267 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007268
7269 c1 = *s1++;
7270 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007271
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 if (c1 > (1<<11) * 26)
7273 c1 += utf16Fixup[c1>>11];
7274 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007275 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007276 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007277
7278 if (c1 != c2)
7279 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007280
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007281 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 }
7283
7284 return (len1 < len2) ? -1 : (len1 != len2);
7285}
7286
Marc-André Lemburge5034372000-08-08 08:04:29 +00007287#else
7288
7289static int
7290unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7291{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007292 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007293
7294 Py_UNICODE *s1 = str1->str;
7295 Py_UNICODE *s2 = str2->str;
7296
7297 len1 = str1->length;
7298 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007299
Marc-André Lemburge5034372000-08-08 08:04:29 +00007300 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007301 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007302
Fredrik Lundh45714e92001-06-26 16:39:36 +00007303 c1 = *s1++;
7304 c2 = *s2++;
7305
7306 if (c1 != c2)
7307 return (c1 < c2) ? -1 : 1;
7308
Marc-André Lemburge5034372000-08-08 08:04:29 +00007309 len1--; len2--;
7310 }
7311
7312 return (len1 < len2) ? -1 : (len1 != len2);
7313}
7314
7315#endif
7316
Alexander Belopolsky40018472011-02-26 01:02:56 +00007317int
7318PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007320 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7321 return unicode_compare((PyUnicodeObject *)left,
7322 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007323 PyErr_Format(PyExc_TypeError,
7324 "Can't compare %.100s and %.100s",
7325 left->ob_type->tp_name,
7326 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327 return -1;
7328}
7329
Martin v. Löwis5b222132007-06-10 09:51:05 +00007330int
7331PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7332{
7333 int i;
7334 Py_UNICODE *id;
7335 assert(PyUnicode_Check(uni));
7336 id = PyUnicode_AS_UNICODE(uni);
7337 /* Compare Unicode string and source character set string */
7338 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 if (id[i] != str[i])
7340 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007341 /* This check keeps Python strings that end in '\0' from comparing equal
7342 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007343 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007345 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007347 return 0;
7348}
7349
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007350
Benjamin Peterson29060642009-01-31 22:14:21 +00007351#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007352 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007353
Alexander Belopolsky40018472011-02-26 01:02:56 +00007354PyObject *
7355PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007356{
7357 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007358
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007359 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7360 PyObject *v;
Benjamin Peterson5fd4bd32011-03-06 09:06:34 -06007361 if (PyUnicode_GET_SIZE(left) != PyUnicode_GET_SIZE(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007362 if (op == Py_EQ) {
7363 Py_INCREF(Py_False);
7364 return Py_False;
7365 }
7366 if (op == Py_NE) {
7367 Py_INCREF(Py_True);
7368 return Py_True;
7369 }
7370 }
7371 if (left == right)
7372 result = 0;
7373 else
7374 result = unicode_compare((PyUnicodeObject *)left,
7375 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007376
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007377 /* Convert the return value to a Boolean */
7378 switch (op) {
7379 case Py_EQ:
7380 v = TEST_COND(result == 0);
7381 break;
7382 case Py_NE:
7383 v = TEST_COND(result != 0);
7384 break;
7385 case Py_LE:
7386 v = TEST_COND(result <= 0);
7387 break;
7388 case Py_GE:
7389 v = TEST_COND(result >= 0);
7390 break;
7391 case Py_LT:
7392 v = TEST_COND(result == -1);
7393 break;
7394 case Py_GT:
7395 v = TEST_COND(result == 1);
7396 break;
7397 default:
7398 PyErr_BadArgument();
7399 return NULL;
7400 }
7401 Py_INCREF(v);
7402 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007404
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007405 Py_INCREF(Py_NotImplemented);
7406 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007407}
7408
Alexander Belopolsky40018472011-02-26 01:02:56 +00007409int
7410PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007411{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007412 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007413 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007414
7415 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007416 sub = PyUnicode_FromObject(element);
7417 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 PyErr_Format(PyExc_TypeError,
7419 "'in <string>' requires string as left operand, not %s",
7420 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007421 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007422 }
7423
Thomas Wouters477c8d52006-05-27 19:21:47 +00007424 str = PyUnicode_FromObject(container);
7425 if (!str) {
7426 Py_DECREF(sub);
7427 return -1;
7428 }
7429
7430 result = stringlib_contains_obj(str, sub);
7431
7432 Py_DECREF(str);
7433 Py_DECREF(sub);
7434
Guido van Rossum403d68b2000-03-13 15:55:09 +00007435 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007436}
7437
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438/* Concat to string or Unicode object giving a new Unicode object. */
7439
Alexander Belopolsky40018472011-02-26 01:02:56 +00007440PyObject *
7441PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442{
7443 PyUnicodeObject *u = NULL, *v = NULL, *w;
7444
7445 /* Coerce the two arguments */
7446 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7447 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7450 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452
7453 /* Shortcuts */
7454 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 Py_DECREF(v);
7456 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 }
7458 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 Py_DECREF(u);
7460 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 }
7462
7463 /* Concat the two Unicode strings */
7464 w = _PyUnicode_New(u->length + v->length);
7465 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 Py_UNICODE_COPY(w->str, u->str, u->length);
7468 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7469
7470 Py_DECREF(u);
7471 Py_DECREF(v);
7472 return (PyObject *)w;
7473
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 Py_XDECREF(u);
7476 Py_XDECREF(v);
7477 return NULL;
7478}
7479
Walter Dörwald1ab83302007-05-18 17:15:44 +00007480void
7481PyUnicode_Append(PyObject **pleft, PyObject *right)
7482{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007483 PyObject *new;
7484 if (*pleft == NULL)
7485 return;
7486 if (right == NULL || !PyUnicode_Check(*pleft)) {
7487 Py_DECREF(*pleft);
7488 *pleft = NULL;
7489 return;
7490 }
7491 new = PyUnicode_Concat(*pleft, right);
7492 Py_DECREF(*pleft);
7493 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007494}
7495
7496void
7497PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7498{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007499 PyUnicode_Append(pleft, right);
7500 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007501}
7502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007506Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007507string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007508interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509
7510static PyObject *
7511unicode_count(PyUnicodeObject *self, PyObject *args)
7512{
7513 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007514 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007515 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 PyObject *result;
7517
Jesus Ceaac451502011-04-20 17:09:23 +02007518 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7519 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007521
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007522 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007523 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007524 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007525 substring->str, substring->length,
7526 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007527 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528
7529 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007530
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 return result;
7532}
7533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007534PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007535 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007537Encode S using the codec registered for encoding. Default encoding\n\
7538is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007539handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007540a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7541'xmlcharrefreplace' as well as any other name registered with\n\
7542codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543
7544static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007545unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007547 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548 char *encoding = NULL;
7549 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007550
Benjamin Peterson308d6372009-09-18 21:42:35 +00007551 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7552 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007554 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007555}
7556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007557PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007558 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559\n\
7560Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007561If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562
7563static PyObject*
7564unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7565{
7566 Py_UNICODE *e;
7567 Py_UNICODE *p;
7568 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007569 Py_UNICODE *qe;
7570 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007571 PyUnicodeObject *u;
7572 int tabsize = 8;
7573
7574 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
Thomas Wouters7e474022000-07-16 12:04:32 +00007577 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007578 i = 0; /* chars up to and including most recent \n or \r */
7579 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7580 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581 for (p = self->str; p < e; p++)
7582 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 if (tabsize > 0) {
7584 incr = tabsize - (j % tabsize); /* cannot overflow */
7585 if (j > PY_SSIZE_T_MAX - incr)
7586 goto overflow1;
7587 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007588 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 if (j > PY_SSIZE_T_MAX - 1)
7592 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007593 j++;
7594 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 if (i > PY_SSIZE_T_MAX - j)
7596 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007598 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 }
7600 }
7601
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007602 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007604
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 /* Second pass: create output string and fill it */
7606 u = _PyUnicode_New(i + j);
7607 if (!u)
7608 return NULL;
7609
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007610 j = 0; /* same as in first pass */
7611 q = u->str; /* next output char */
7612 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613
7614 for (p = self->str; p < e; p++)
7615 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 if (tabsize > 0) {
7617 i = tabsize - (j % tabsize);
7618 j += i;
7619 while (i--) {
7620 if (q >= qe)
7621 goto overflow2;
7622 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007623 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 else {
7627 if (q >= qe)
7628 goto overflow2;
7629 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007630 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631 if (*p == '\n' || *p == '\r')
7632 j = 0;
7633 }
7634
7635 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007636
7637 overflow2:
7638 Py_DECREF(u);
7639 overflow1:
7640 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7641 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007642}
7643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007644PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646\n\
7647Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007648such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649arguments start and end are interpreted as in slice notation.\n\
7650\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007651Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652
7653static PyObject *
7654unicode_find(PyUnicodeObject *self, PyObject *args)
7655{
Jesus Ceaac451502011-04-20 17:09:23 +02007656 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007657 Py_ssize_t start;
7658 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007659 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
Jesus Ceaac451502011-04-20 17:09:23 +02007661 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7662 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664
Thomas Wouters477c8d52006-05-27 19:21:47 +00007665 result = stringlib_find_slice(
7666 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7667 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7668 start, end
7669 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670
7671 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007672
Christian Heimes217cfd12007-12-02 14:31:20 +00007673 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674}
7675
7676static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007677unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678{
7679 if (index < 0 || index >= self->length) {
7680 PyErr_SetString(PyExc_IndexError, "string index out of range");
7681 return NULL;
7682 }
7683
7684 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7685}
7686
Guido van Rossumc2504932007-09-18 19:42:40 +00007687/* Believe it or not, this produces the same value for ASCII strings
7688 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007689static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007690unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691{
Guido van Rossumc2504932007-09-18 19:42:40 +00007692 Py_ssize_t len;
7693 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007694 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007695
7696 if (self->hash != -1)
7697 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007698 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007699 p = self->str;
7700 x = *p << 7;
7701 while (--len >= 0)
7702 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007703 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007704 if (x == -1)
7705 x = -2;
7706 self->hash = x;
7707 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708}
7709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007710PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007713Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714
7715static PyObject *
7716unicode_index(PyUnicodeObject *self, PyObject *args)
7717{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007718 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007719 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007720 Py_ssize_t start;
7721 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722
Jesus Ceaac451502011-04-20 17:09:23 +02007723 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7724 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
Thomas Wouters477c8d52006-05-27 19:21:47 +00007727 result = stringlib_find_slice(
7728 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7729 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7730 start, end
7731 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732
7733 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007734
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 if (result < 0) {
7736 PyErr_SetString(PyExc_ValueError, "substring not found");
7737 return NULL;
7738 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007739
Christian Heimes217cfd12007-12-02 14:31:20 +00007740 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741}
7742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007743PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007746Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007747at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748
7749static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007750unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751{
7752 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7753 register const Py_UNICODE *e;
7754 int cased;
7755
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 /* Shortcut for single character strings */
7757 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007758 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007760 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007761 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007763
Guido van Rossumd57fd912000-03-10 22:53:23 +00007764 e = p + PyUnicode_GET_SIZE(self);
7765 cased = 0;
7766 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007768
Benjamin Peterson29060642009-01-31 22:14:21 +00007769 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7770 return PyBool_FromLong(0);
7771 else if (!cased && Py_UNICODE_ISLOWER(ch))
7772 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007774 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007775}
7776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007777PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007780Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007781at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782
7783static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007784unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785{
7786 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7787 register const Py_UNICODE *e;
7788 int cased;
7789
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790 /* Shortcut for single character strings */
7791 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007794 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007795 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007797
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 e = p + PyUnicode_GET_SIZE(self);
7799 cased = 0;
7800 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007802
Benjamin Peterson29060642009-01-31 22:14:21 +00007803 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7804 return PyBool_FromLong(0);
7805 else if (!cased && Py_UNICODE_ISUPPER(ch))
7806 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007808 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007809}
7810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007811PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007812 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007814Return True if S is a titlecased string and there is at least one\n\
7815character in S, i.e. upper- and titlecase characters may only\n\
7816follow uncased characters and lowercase characters only cased ones.\n\
7817Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007818
7819static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007820unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821{
7822 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7823 register const Py_UNICODE *e;
7824 int cased, previous_is_cased;
7825
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826 /* Shortcut for single character strings */
7827 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7829 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007831 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007832 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007833 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007834
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835 e = p + PyUnicode_GET_SIZE(self);
7836 cased = 0;
7837 previous_is_cased = 0;
7838 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007840
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7842 if (previous_is_cased)
7843 return PyBool_FromLong(0);
7844 previous_is_cased = 1;
7845 cased = 1;
7846 }
7847 else if (Py_UNICODE_ISLOWER(ch)) {
7848 if (!previous_is_cased)
7849 return PyBool_FromLong(0);
7850 previous_is_cased = 1;
7851 cased = 1;
7852 }
7853 else
7854 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007856 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857}
7858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007859PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007862Return True if all characters in S are whitespace\n\
7863and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864
7865static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007866unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867{
7868 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7869 register const Py_UNICODE *e;
7870
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871 /* Shortcut for single character strings */
7872 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 Py_UNICODE_ISSPACE(*p))
7874 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007876 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007877 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007879
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 e = p + PyUnicode_GET_SIZE(self);
7881 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 if (!Py_UNICODE_ISSPACE(*p))
7883 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007885 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886}
7887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007888PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007890\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007891Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007892and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007893
7894static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007895unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007896{
7897 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7898 register const Py_UNICODE *e;
7899
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007900 /* Shortcut for single character strings */
7901 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 Py_UNICODE_ISALPHA(*p))
7903 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007904
7905 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007906 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007908
7909 e = p + PyUnicode_GET_SIZE(self);
7910 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 if (!Py_UNICODE_ISALPHA(*p))
7912 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007913 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007914 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007915}
7916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007917PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007919\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007920Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007921and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007922
7923static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007924unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007925{
7926 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7927 register const Py_UNICODE *e;
7928
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007929 /* Shortcut for single character strings */
7930 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 Py_UNICODE_ISALNUM(*p))
7932 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007933
7934 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007935 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007936 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007937
7938 e = p + PyUnicode_GET_SIZE(self);
7939 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 if (!Py_UNICODE_ISALNUM(*p))
7941 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007942 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007943 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007944}
7945
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007946PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007947 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007949Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007950False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951
7952static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007953unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954{
7955 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7956 register const Py_UNICODE *e;
7957
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958 /* Shortcut for single character strings */
7959 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 Py_UNICODE_ISDECIMAL(*p))
7961 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007963 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007964 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007966
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 e = p + PyUnicode_GET_SIZE(self);
7968 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 if (!Py_UNICODE_ISDECIMAL(*p))
7970 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007972 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973}
7974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007975PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007978Return True if all characters in S are digits\n\
7979and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980
7981static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007982unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983{
7984 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7985 register const Py_UNICODE *e;
7986
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 /* Shortcut for single character strings */
7988 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 Py_UNICODE_ISDIGIT(*p))
7990 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007992 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007993 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007995
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 e = p + PyUnicode_GET_SIZE(self);
7997 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 if (!Py_UNICODE_ISDIGIT(*p))
7999 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008001 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002}
8003
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008004PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008007Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008008False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009
8010static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008011unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012{
8013 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8014 register const Py_UNICODE *e;
8015
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 /* Shortcut for single character strings */
8017 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 Py_UNICODE_ISNUMERIC(*p))
8019 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008021 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008022 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008024
Guido van Rossumd57fd912000-03-10 22:53:23 +00008025 e = p + PyUnicode_GET_SIZE(self);
8026 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 if (!Py_UNICODE_ISNUMERIC(*p))
8028 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008030 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031}
8032
Martin v. Löwis47383402007-08-15 07:32:56 +00008033int
8034PyUnicode_IsIdentifier(PyObject *self)
8035{
8036 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8037 register const Py_UNICODE *e;
8038
8039 /* Special case for empty strings */
8040 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008042
8043 /* PEP 3131 says that the first character must be in
8044 XID_Start and subsequent characters in XID_Continue,
8045 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008046 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008047 letters, digits, underscore). However, given the current
8048 definition of XID_Start and XID_Continue, it is sufficient
8049 to check just for these, except that _ must be allowed
8050 as starting an identifier. */
8051 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8052 return 0;
8053
8054 e = p + PyUnicode_GET_SIZE(self);
8055 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 if (!_PyUnicode_IsXidContinue(*p))
8057 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008058 }
8059 return 1;
8060}
8061
8062PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008064\n\
8065Return True if S is a valid identifier according\n\
8066to the language definition.");
8067
8068static PyObject*
8069unicode_isidentifier(PyObject *self)
8070{
8071 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8072}
8073
Georg Brandl559e5d72008-06-11 18:37:52 +00008074PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008076\n\
8077Return True if all characters in S are considered\n\
8078printable in repr() or S is empty, False otherwise.");
8079
8080static PyObject*
8081unicode_isprintable(PyObject *self)
8082{
8083 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8084 register const Py_UNICODE *e;
8085
8086 /* Shortcut for single character strings */
8087 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8088 Py_RETURN_TRUE;
8089 }
8090
8091 e = p + PyUnicode_GET_SIZE(self);
8092 for (; p < e; p++) {
8093 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8094 Py_RETURN_FALSE;
8095 }
8096 }
8097 Py_RETURN_TRUE;
8098}
8099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008100PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008101 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102\n\
8103Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008104iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105
8106static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008107unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008109 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110}
8111
Martin v. Löwis18e16552006-02-15 17:27:45 +00008112static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113unicode_length(PyUnicodeObject *self)
8114{
8115 return self->length;
8116}
8117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008118PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008121Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008122done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123
8124static PyObject *
8125unicode_ljust(PyUnicodeObject *self, PyObject *args)
8126{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008127 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008128 Py_UNICODE fillchar = ' ';
8129
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008130 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008131 return NULL;
8132
Tim Peters7a29bd52001-09-12 03:03:31 +00008133 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 Py_INCREF(self);
8135 return (PyObject*) self;
8136 }
8137
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008138 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139}
8140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008141PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008144Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145
8146static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008147unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 return fixup(self, fixlower);
8150}
8151
/* Selectors for the strip helpers; each value is also the index of the
   matching entry in stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
8160
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008161/* externally visible for str.strip(unicode) */
8162PyObject *
8163_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8164{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008165 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8166 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8167 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8168 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8169 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008170
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008172
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 i = 0;
8174 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8176 i++;
8177 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008178 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008179
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 j = len;
8181 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 do {
8183 j--;
8184 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8185 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008187
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 Py_INCREF(self);
8190 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 }
8192 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008194}
8195
Guido van Rossumd57fd912000-03-10 22:53:23 +00008196
8197static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008198do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008200 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8201 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008202
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 i = 0;
8204 if (striptype != RIGHTSTRIP) {
8205 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8206 i++;
8207 }
8208 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008209
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 j = len;
8211 if (striptype != LEFTSTRIP) {
8212 do {
8213 j--;
8214 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8215 j++;
8216 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008217
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8219 Py_INCREF(self);
8220 return (PyObject*)self;
8221 }
8222 else
8223 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224}
8225
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008226
8227static PyObject *
8228do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8229{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008230 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008231
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8233 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008234
Benjamin Peterson14339b62009-01-31 16:36:08 +00008235 if (sep != NULL && sep != Py_None) {
8236 if (PyUnicode_Check(sep))
8237 return _PyUnicode_XStrip(self, striptype, sep);
8238 else {
8239 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 "%s arg must be None or str",
8241 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008242 return NULL;
8243 }
8244 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008245
Benjamin Peterson14339b62009-01-31 16:36:08 +00008246 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008247}
8248
8249
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008250PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008252\n\
8253Return a copy of the string S with leading and trailing\n\
8254whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008255If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008256
8257static PyObject *
8258unicode_strip(PyUnicodeObject *self, PyObject *args)
8259{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008260 if (PyTuple_GET_SIZE(args) == 0)
8261 return do_strip(self, BOTHSTRIP); /* Common case */
8262 else
8263 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008264}
8265
8266
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008267PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008269\n\
8270Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008271If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008272
8273static PyObject *
8274unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8275{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008276 if (PyTuple_GET_SIZE(args) == 0)
8277 return do_strip(self, LEFTSTRIP); /* Common case */
8278 else
8279 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008280}
8281
8282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008283PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008285\n\
8286Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008287If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008288
8289static PyObject *
8290unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8291{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008292 if (PyTuple_GET_SIZE(args) == 0)
8293 return do_strip(self, RIGHTSTRIP); /* Common case */
8294 else
8295 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008296}
8297
8298
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008300unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301{
8302 PyUnicodeObject *u;
8303 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008304 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008305 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306
Georg Brandl222de0f2009-04-12 12:01:50 +00008307 if (len < 1) {
8308 Py_INCREF(unicode_empty);
8309 return (PyObject *)unicode_empty;
8310 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311
Tim Peters7a29bd52001-09-12 03:03:31 +00008312 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 /* no repeat, return original string */
8314 Py_INCREF(str);
8315 return (PyObject*) str;
8316 }
Tim Peters8f422462000-09-09 06:13:41 +00008317
8318 /* ensure # of chars needed doesn't overflow int and # of bytes
8319 * needed doesn't overflow size_t
8320 */
8321 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008322 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008323 PyErr_SetString(PyExc_OverflowError,
8324 "repeated string is too long");
8325 return NULL;
8326 }
8327 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8328 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8329 PyErr_SetString(PyExc_OverflowError,
8330 "repeated string is too long");
8331 return NULL;
8332 }
8333 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 if (!u)
8335 return NULL;
8336
8337 p = u->str;
8338
Georg Brandl222de0f2009-04-12 12:01:50 +00008339 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008340 Py_UNICODE_FILL(p, str->str[0], len);
8341 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008342 Py_ssize_t done = str->length; /* number of characters copied this far */
8343 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008345 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008346 Py_UNICODE_COPY(p+done, p, n);
8347 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 }
8350
8351 return (PyObject*) u;
8352}
8353
Alexander Belopolsky40018472011-02-26 01:02:56 +00008354PyObject *
8355PyUnicode_Replace(PyObject *obj,
8356 PyObject *subobj,
8357 PyObject *replobj,
8358 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359{
8360 PyObject *self;
8361 PyObject *str1;
8362 PyObject *str2;
8363 PyObject *result;
8364
8365 self = PyUnicode_FromObject(obj);
8366 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368 str1 = PyUnicode_FromObject(subobj);
8369 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 Py_DECREF(self);
8371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 }
8373 str2 = PyUnicode_FromObject(replobj);
8374 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 Py_DECREF(self);
8376 Py_DECREF(str1);
8377 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 }
Tim Petersced69f82003-09-16 20:30:58 +00008379 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 (PyUnicodeObject *)str1,
8381 (PyUnicodeObject *)str2,
8382 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 Py_DECREF(self);
8384 Py_DECREF(str1);
8385 Py_DECREF(str2);
8386 return result;
8387}
8388
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008389PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008390 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391\n\
8392Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008393old replaced by new. If the optional argument count is\n\
8394given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395
8396static PyObject*
8397unicode_replace(PyUnicodeObject *self, PyObject *args)
8398{
8399 PyUnicodeObject *str1;
8400 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008401 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008402 PyObject *result;
8403
Martin v. Löwis18e16552006-02-15 17:27:45 +00008404 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 return NULL;
8406 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8407 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008410 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 Py_DECREF(str1);
8412 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008414
8415 result = replace(self, str1, str2, maxcount);
8416
8417 Py_DECREF(str1);
8418 Py_DECREF(str2);
8419 return result;
8420}
8421
Alexander Belopolsky40018472011-02-26 01:02:56 +00008422static PyObject *
8423unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008425 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008426 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008427 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8428 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8429
8430 /* XXX(nnorwitz): rather than over-allocating, it would be
8431 better to choose a different scheme. Perhaps scan the
8432 first N-chars of the string and allocate based on that size.
8433 */
8434 /* Initial allocation is based on the longest-possible unichr
8435 escape.
8436
8437 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8438 unichr, so in this case it's the longest unichr escape. In
8439 narrow (UTF-16) builds this is five chars per source unichr
8440 since there are two unichrs in the surrogate pair, so in narrow
8441 (UTF-16) builds it's not the longest unichr escape.
8442
8443 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8444 so in the narrow (UTF-16) build case it's the longest unichr
8445 escape.
8446 */
8447
Walter Dörwald1ab83302007-05-18 17:15:44 +00008448 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008450#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008452#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008454#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008456 if (repr == NULL)
8457 return NULL;
8458
Walter Dörwald1ab83302007-05-18 17:15:44 +00008459 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008460
8461 /* Add quote */
8462 *p++ = (findchar(s, size, '\'') &&
8463 !findchar(s, size, '"')) ? '"' : '\'';
8464 while (size-- > 0) {
8465 Py_UNICODE ch = *s++;
8466
8467 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008468 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008469 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008470 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008471 continue;
8472 }
8473
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008475 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008476 *p++ = '\\';
8477 *p++ = 't';
8478 }
8479 else if (ch == '\n') {
8480 *p++ = '\\';
8481 *p++ = 'n';
8482 }
8483 else if (ch == '\r') {
8484 *p++ = '\\';
8485 *p++ = 'r';
8486 }
8487
8488 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008489 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008490 *p++ = '\\';
8491 *p++ = 'x';
8492 *p++ = hexdigits[(ch >> 4) & 0x000F];
8493 *p++ = hexdigits[ch & 0x000F];
8494 }
8495
Georg Brandl559e5d72008-06-11 18:37:52 +00008496 /* Copy ASCII characters as-is */
8497 else if (ch < 0x7F) {
8498 *p++ = ch;
8499 }
8500
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008502 else {
8503 Py_UCS4 ucs = ch;
8504
8505#ifndef Py_UNICODE_WIDE
8506 Py_UNICODE ch2 = 0;
8507 /* Get code point from surrogate pair */
8508 if (size > 0) {
8509 ch2 = *s;
8510 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008512 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008514 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008515 size--;
8516 }
8517 }
8518#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008519 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008520 (categories Z* and C* except ASCII space)
8521 */
8522 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8523 /* Map 8-bit characters to '\xhh' */
8524 if (ucs <= 0xff) {
8525 *p++ = '\\';
8526 *p++ = 'x';
8527 *p++ = hexdigits[(ch >> 4) & 0x000F];
8528 *p++ = hexdigits[ch & 0x000F];
8529 }
8530 /* Map 21-bit characters to '\U00xxxxxx' */
8531 else if (ucs >= 0x10000) {
8532 *p++ = '\\';
8533 *p++ = 'U';
8534 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8535 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8536 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8537 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8538 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8539 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8540 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8541 *p++ = hexdigits[ucs & 0x0000000F];
8542 }
8543 /* Map 16-bit characters to '\uxxxx' */
8544 else {
8545 *p++ = '\\';
8546 *p++ = 'u';
8547 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8548 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8549 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8550 *p++ = hexdigits[ucs & 0x000F];
8551 }
8552 }
8553 /* Copy characters as-is */
8554 else {
8555 *p++ = ch;
8556#ifndef Py_UNICODE_WIDE
8557 if (ucs >= 0x10000)
8558 *p++ = ch2;
8559#endif
8560 }
8561 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008562 }
8563 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008564 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008565
8566 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008567 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008568 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569}
8570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008571PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573\n\
8574Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008575such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576arguments start and end are interpreted as in slice notation.\n\
8577\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008578Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579
8580static PyObject *
8581unicode_rfind(PyUnicodeObject *self, PyObject *args)
8582{
Jesus Ceaac451502011-04-20 17:09:23 +02008583 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008584 Py_ssize_t start;
8585 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008586 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587
Jesus Ceaac451502011-04-20 17:09:23 +02008588 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8589 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
Thomas Wouters477c8d52006-05-27 19:21:47 +00008592 result = stringlib_rfind_slice(
8593 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8594 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8595 start, end
8596 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597
8598 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008599
Christian Heimes217cfd12007-12-02 14:31:20 +00008600 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601}
8602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008603PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008604 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008606Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607
8608static PyObject *
8609unicode_rindex(PyUnicodeObject *self, PyObject *args)
8610{
Jesus Ceaac451502011-04-20 17:09:23 +02008611 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008612 Py_ssize_t start;
8613 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008614 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615
Jesus Ceaac451502011-04-20 17:09:23 +02008616 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8617 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008618 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619
Thomas Wouters477c8d52006-05-27 19:21:47 +00008620 result = stringlib_rfind_slice(
8621 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8622 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8623 start, end
8624 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625
8626 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008627
Guido van Rossumd57fd912000-03-10 22:53:23 +00008628 if (result < 0) {
8629 PyErr_SetString(PyExc_ValueError, "substring not found");
8630 return NULL;
8631 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008632 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633}
8634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008635PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008638Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008639done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640
8641static PyObject *
8642unicode_rjust(PyUnicodeObject *self, PyObject *args)
8643{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008644 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008645 Py_UNICODE fillchar = ' ';
8646
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008647 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 return NULL;
8649
Tim Peters7a29bd52001-09-12 03:03:31 +00008650 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 Py_INCREF(self);
8652 return (PyObject*) self;
8653 }
8654
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008655 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656}
8657
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658PyObject *
8659PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660{
8661 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008662
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 s = PyUnicode_FromObject(s);
8664 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008665 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 if (sep != NULL) {
8667 sep = PyUnicode_FromObject(sep);
8668 if (sep == NULL) {
8669 Py_DECREF(s);
8670 return NULL;
8671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 }
8673
8674 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8675
8676 Py_DECREF(s);
8677 Py_XDECREF(sep);
8678 return result;
8679}
8680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008681PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683\n\
8684Return a list of the words in S, using sep as the\n\
8685delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008686splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008687whitespace string is a separator and empty strings are\n\
8688removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008689
8690static PyObject*
8691unicode_split(PyUnicodeObject *self, PyObject *args)
8692{
8693 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008695
Martin v. Löwis18e16552006-02-15 17:27:45 +00008696 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 return NULL;
8698
8699 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705}
8706
Thomas Wouters477c8d52006-05-27 19:21:47 +00008707PyObject *
8708PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8709{
8710 PyObject* str_obj;
8711 PyObject* sep_obj;
8712 PyObject* out;
8713
8714 str_obj = PyUnicode_FromObject(str_in);
8715 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008717 sep_obj = PyUnicode_FromObject(sep_in);
8718 if (!sep_obj) {
8719 Py_DECREF(str_obj);
8720 return NULL;
8721 }
8722
8723 out = stringlib_partition(
8724 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8725 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8726 );
8727
8728 Py_DECREF(sep_obj);
8729 Py_DECREF(str_obj);
8730
8731 return out;
8732}
8733
8734
8735PyObject *
8736PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8737{
8738 PyObject* str_obj;
8739 PyObject* sep_obj;
8740 PyObject* out;
8741
8742 str_obj = PyUnicode_FromObject(str_in);
8743 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008745 sep_obj = PyUnicode_FromObject(sep_in);
8746 if (!sep_obj) {
8747 Py_DECREF(str_obj);
8748 return NULL;
8749 }
8750
8751 out = stringlib_rpartition(
8752 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8753 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8754 );
8755
8756 Py_DECREF(sep_obj);
8757 Py_DECREF(str_obj);
8758
8759 return out;
8760}
8761
8762PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008764\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008765Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008766the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008767found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008768
8769static PyObject*
8770unicode_partition(PyUnicodeObject *self, PyObject *separator)
8771{
8772 return PyUnicode_Partition((PyObject *)self, separator);
8773}
8774
8775PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008776 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008777\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008778Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008779the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008780separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008781
8782static PyObject*
8783unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8784{
8785 return PyUnicode_RPartition((PyObject *)self, separator);
8786}
8787
Alexander Belopolsky40018472011-02-26 01:02:56 +00008788PyObject *
8789PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008790{
8791 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008792
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008793 s = PyUnicode_FromObject(s);
8794 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008795 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 if (sep != NULL) {
8797 sep = PyUnicode_FromObject(sep);
8798 if (sep == NULL) {
8799 Py_DECREF(s);
8800 return NULL;
8801 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008802 }
8803
8804 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8805
8806 Py_DECREF(s);
8807 Py_XDECREF(sep);
8808 return result;
8809}
8810
8811PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008812 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008813\n\
8814Return a list of the words in S, using sep as the\n\
8815delimiter string, starting at the end of the string and\n\
8816working to the front. If maxsplit is given, at most maxsplit\n\
8817splits are done. If sep is not specified, any whitespace string\n\
8818is a separator.");
8819
8820static PyObject*
8821unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8822{
8823 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008824 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008825
Martin v. Löwis18e16552006-02-15 17:27:45 +00008826 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008827 return NULL;
8828
8829 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008831 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008833 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008835}
8836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008837PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008838 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008839\n\
8840Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008841Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008842is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843
8844static PyObject*
8845unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8846{
Guido van Rossum86662912000-04-11 15:38:46 +00008847 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008848
Guido van Rossum86662912000-04-11 15:38:46 +00008849 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850 return NULL;
8851
Guido van Rossum86662912000-04-11 15:38:46 +00008852 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853}
8854
8855static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008856PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857{
Walter Dörwald346737f2007-05-31 10:44:43 +00008858 if (PyUnicode_CheckExact(self)) {
8859 Py_INCREF(self);
8860 return self;
8861 } else
8862 /* Subtype -- return genuine unicode string with the same value. */
8863 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8864 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865}
8866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008867PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008868 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869\n\
8870Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008871and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872
8873static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008874unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008875{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876 return fixup(self, fixswapcase);
8877}
8878
Georg Brandlceee0772007-11-27 23:48:05 +00008879PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008881\n\
8882Return a translation table usable for str.translate().\n\
8883If there is only one argument, it must be a dictionary mapping Unicode\n\
8884ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008885Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008886If there are two arguments, they must be strings of equal length, and\n\
8887in the resulting dictionary, each character in x will be mapped to the\n\
8888character at the same position in y. If there is a third argument, it\n\
8889must be a string, whose characters will be mapped to None in the result.");
8890
8891static PyObject*
8892unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8893{
8894 PyObject *x, *y = NULL, *z = NULL;
8895 PyObject *new = NULL, *key, *value;
8896 Py_ssize_t i = 0;
8897 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008898
Georg Brandlceee0772007-11-27 23:48:05 +00008899 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8900 return NULL;
8901 new = PyDict_New();
8902 if (!new)
8903 return NULL;
8904 if (y != NULL) {
8905 /* x must be a string too, of equal length */
8906 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8907 if (!PyUnicode_Check(x)) {
8908 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8909 "be a string if there is a second argument");
8910 goto err;
8911 }
8912 if (PyUnicode_GET_SIZE(x) != ylen) {
8913 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8914 "arguments must have equal length");
8915 goto err;
8916 }
8917 /* create entries for translating chars in x to those in y */
8918 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008919 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8920 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008921 if (!key || !value)
8922 goto err;
8923 res = PyDict_SetItem(new, key, value);
8924 Py_DECREF(key);
8925 Py_DECREF(value);
8926 if (res < 0)
8927 goto err;
8928 }
8929 /* create entries for deleting chars in z */
8930 if (z != NULL) {
8931 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008932 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008933 if (!key)
8934 goto err;
8935 res = PyDict_SetItem(new, key, Py_None);
8936 Py_DECREF(key);
8937 if (res < 0)
8938 goto err;
8939 }
8940 }
8941 } else {
8942 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008943 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008944 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8945 "to maketrans it must be a dict");
8946 goto err;
8947 }
8948 /* copy entries into the new dict, converting string keys to int keys */
8949 while (PyDict_Next(x, &i, &key, &value)) {
8950 if (PyUnicode_Check(key)) {
8951 /* convert string keys to integer keys */
8952 PyObject *newkey;
8953 if (PyUnicode_GET_SIZE(key) != 1) {
8954 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8955 "table must be of length 1");
8956 goto err;
8957 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008958 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008959 if (!newkey)
8960 goto err;
8961 res = PyDict_SetItem(new, newkey, value);
8962 Py_DECREF(newkey);
8963 if (res < 0)
8964 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008965 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008966 /* just keep integer keys */
8967 if (PyDict_SetItem(new, key, value) < 0)
8968 goto err;
8969 } else {
8970 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8971 "be strings or integers");
8972 goto err;
8973 }
8974 }
8975 }
8976 return new;
8977 err:
8978 Py_DECREF(new);
8979 return NULL;
8980}
8981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008982PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984\n\
8985Return a copy of the string S, where all characters have been mapped\n\
8986through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008987Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008988Unmapped characters are left untouched. Characters mapped to None\n\
8989are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990
8991static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008992unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993{
Georg Brandlceee0772007-11-27 23:48:05 +00008994 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995}
8996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008997PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009000Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001
9002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009003unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005 return fixup(self, fixupper);
9006}
9007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009008PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009009 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009011Pad a numeric string S with zeros on the left, to fill a field\n\
9012of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013
9014static PyObject *
9015unicode_zfill(PyUnicodeObject *self, PyObject *args)
9016{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009017 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 PyUnicodeObject *u;
9019
Martin v. Löwis18e16552006-02-15 17:27:45 +00009020 Py_ssize_t width;
9021 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009022 return NULL;
9023
9024 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009025 if (PyUnicode_CheckExact(self)) {
9026 Py_INCREF(self);
9027 return (PyObject*) self;
9028 }
9029 else
9030 return PyUnicode_FromUnicode(
9031 PyUnicode_AS_UNICODE(self),
9032 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009033 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034 }
9035
9036 fill = width - self->length;
9037
9038 u = pad(self, fill, 0, '0');
9039
Walter Dörwald068325e2002-04-15 13:36:47 +00009040 if (u == NULL)
9041 return NULL;
9042
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043 if (u->str[fill] == '+' || u->str[fill] == '-') {
9044 /* move sign to beginning of string */
9045 u->str[0] = u->str[fill];
9046 u->str[fill] = '0';
9047 }
9048
9049 return (PyObject*) u;
9050}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051
#if 0
/* Compiled out.  Returns the file-level numfree counter as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}

/* Compiled out.  Feeds self's character buffer and length to
   PyUnicode_TransformDecimalToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
                                                  PyUnicode_GET_SIZE(self));
    return converted;
}
#endif
9066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009067PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009070Return True if S starts with the specified prefix, False otherwise.\n\
9071With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009072With optional end, stop comparing S at that position.\n\
9073prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074
9075static PyObject *
9076unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009077 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009078{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009079 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009081 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009082 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009083 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084
Jesus Ceaac451502011-04-20 17:09:23 +02009085 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009086 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009087 if (PyTuple_Check(subobj)) {
9088 Py_ssize_t i;
9089 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9090 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009091 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009092 if (substring == NULL)
9093 return NULL;
9094 result = tailmatch(self, substring, start, end, -1);
9095 Py_DECREF(substring);
9096 if (result) {
9097 Py_RETURN_TRUE;
9098 }
9099 }
9100 /* nothing matched */
9101 Py_RETURN_FALSE;
9102 }
9103 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009104 if (substring == NULL) {
9105 if (PyErr_ExceptionMatches(PyExc_TypeError))
9106 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9107 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009108 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009109 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009110 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009112 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113}
9114
9115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009116PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009117 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009118\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009119Return True if S ends with the specified suffix, False otherwise.\n\
9120With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009121With optional end, stop comparing S at that position.\n\
9122suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123
9124static PyObject *
9125unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009128 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009130 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009131 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009132 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133
Jesus Ceaac451502011-04-20 17:09:23 +02009134 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009136 if (PyTuple_Check(subobj)) {
9137 Py_ssize_t i;
9138 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9139 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009141 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009143 result = tailmatch(self, substring, start, end, +1);
9144 Py_DECREF(substring);
9145 if (result) {
9146 Py_RETURN_TRUE;
9147 }
9148 }
9149 Py_RETURN_FALSE;
9150 }
9151 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009152 if (substring == NULL) {
9153 if (PyErr_ExceptionMatches(PyExc_TypeError))
9154 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9155 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009157 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009158 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009159 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009160 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161}
9162
Eric Smith8c663262007-08-25 02:26:07 +00009163#include "stringlib/string_format.h"
9164
9165PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009167\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009168Return a formatted version of S, using substitutions from args and kwargs.\n\
9169The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009170
Eric Smith27bbca62010-11-04 17:06:58 +00009171PyDoc_STRVAR(format_map__doc__,
9172 "S.format_map(mapping) -> str\n\
9173\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009174Return a formatted version of S, using substitutions from mapping.\n\
9175The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009176
Eric Smith4a7d76d2008-05-30 18:10:19 +00009177static PyObject *
9178unicode__format__(PyObject* self, PyObject* args)
9179{
9180 PyObject *format_spec;
9181
9182 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9183 return NULL;
9184
9185 return _PyUnicode_FormatAdvanced(self,
9186 PyUnicode_AS_UNICODE(format_spec),
9187 PyUnicode_GET_SIZE(format_spec));
9188}
9189
Eric Smith8c663262007-08-25 02:26:07 +00009190PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009192\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009193Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009194
9195static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009196unicode__sizeof__(PyUnicodeObject *v)
9197{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009198 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9199 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009200}
9201
9202PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009203 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009204
9205static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009206unicode_getnewargs(PyUnicodeObject *v)
9207{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009208 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009209}
9210
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211static PyMethodDef unicode_methods[] = {
9212
9213 /* Order is according to common usage: often used methods should
9214 appear first, since lookup is done sequentially. */
9215
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009216 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009217 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9218 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009219 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009220 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9221 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9222 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9223 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9224 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9225 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9226 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009227 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009228 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9229 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9230 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009231 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009232 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9233 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9234 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009235 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009236 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009237 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009238 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009239 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9240 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9241 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9242 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9243 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9244 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9245 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9246 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9247 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9248 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9249 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9250 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9251 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9252 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009253 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009254 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009255 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009256 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009257 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009258 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009259 {"maketrans", (PyCFunction) unicode_maketrans,
9260 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009261 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009262#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009263 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264#endif
9265
9266#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009267 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009268 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009269 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270#endif
9271
Benjamin Peterson14339b62009-01-31 16:36:08 +00009272 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009273 {NULL, NULL}
9274};
9275
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009276static PyObject *
9277unicode_mod(PyObject *v, PyObject *w)
9278{
Benjamin Peterson29060642009-01-31 22:14:21 +00009279 if (!PyUnicode_Check(v)) {
9280 Py_INCREF(Py_NotImplemented);
9281 return Py_NotImplemented;
9282 }
9283 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009284}
9285
9286static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009287 0, /*nb_add*/
9288 0, /*nb_subtract*/
9289 0, /*nb_multiply*/
9290 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009291};
9292
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009294 (lenfunc) unicode_length, /* sq_length */
9295 PyUnicode_Concat, /* sq_concat */
9296 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9297 (ssizeargfunc) unicode_getitem, /* sq_item */
9298 0, /* sq_slice */
9299 0, /* sq_ass_item */
9300 0, /* sq_ass_slice */
9301 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302};
9303
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009304static PyObject*
9305unicode_subscript(PyUnicodeObject* self, PyObject* item)
9306{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009307 if (PyIndex_Check(item)) {
9308 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009309 if (i == -1 && PyErr_Occurred())
9310 return NULL;
9311 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009312 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009313 return unicode_getitem(self, i);
9314 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009315 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009316 Py_UNICODE* source_buf;
9317 Py_UNICODE* result_buf;
9318 PyObject* result;
9319
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009320 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009321 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009322 return NULL;
9323 }
9324
9325 if (slicelength <= 0) {
9326 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009327 } else if (start == 0 && step == 1 && slicelength == self->length &&
9328 PyUnicode_CheckExact(self)) {
9329 Py_INCREF(self);
9330 return (PyObject *)self;
9331 } else if (step == 1) {
9332 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009333 } else {
9334 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009335 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9336 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009337
Benjamin Peterson29060642009-01-31 22:14:21 +00009338 if (result_buf == NULL)
9339 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009340
9341 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9342 result_buf[i] = source_buf[cur];
9343 }
Tim Petersced69f82003-09-16 20:30:58 +00009344
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009345 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009346 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009347 return result;
9348 }
9349 } else {
9350 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9351 return NULL;
9352 }
9353}
9354
9355static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009356 (lenfunc)unicode_length, /* mp_length */
9357 (binaryfunc)unicode_subscript, /* mp_subscript */
9358 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009359};
9360
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362/* Helpers for PyUnicode_Format() */
9363
9364static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009365getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009367 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 (*p_argidx)++;
9370 if (arglen < 0)
9371 return args;
9372 else
9373 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374 }
9375 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009376 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 return NULL;
9378}
9379
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009380/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009382static PyObject *
9383formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009385 char *p;
9386 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009387 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009388
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 x = PyFloat_AsDouble(v);
9390 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009391 return NULL;
9392
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009395
Eric Smith0923d1d2009-04-16 20:16:10 +00009396 p = PyOS_double_to_string(x, type, prec,
9397 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009398 if (p == NULL)
9399 return NULL;
9400 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009401 PyMem_Free(p);
9402 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403}
9404
Tim Peters38fd5b62000-09-21 05:43:11 +00009405static PyObject*
9406formatlong(PyObject *val, int flags, int prec, int type)
9407{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009408 char *buf;
9409 int len;
9410 PyObject *str; /* temporary string object. */
9411 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009412
Benjamin Peterson14339b62009-01-31 16:36:08 +00009413 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9414 if (!str)
9415 return NULL;
9416 result = PyUnicode_FromStringAndSize(buf, len);
9417 Py_DECREF(str);
9418 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009419}
9420
Guido van Rossumd57fd912000-03-10 22:53:23 +00009421static int
9422formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009423 size_t buflen,
9424 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009426 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009427 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009428 if (PyUnicode_GET_SIZE(v) == 1) {
9429 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9430 buf[1] = '\0';
9431 return 1;
9432 }
9433#ifndef Py_UNICODE_WIDE
9434 if (PyUnicode_GET_SIZE(v) == 2) {
9435 /* Decode a valid surrogate pair */
9436 int c0 = PyUnicode_AS_UNICODE(v)[0];
9437 int c1 = PyUnicode_AS_UNICODE(v)[1];
9438 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9439 0xDC00 <= c1 && c1 <= 0xDFFF) {
9440 buf[0] = c0;
9441 buf[1] = c1;
9442 buf[2] = '\0';
9443 return 2;
9444 }
9445 }
9446#endif
9447 goto onError;
9448 }
9449 else {
9450 /* Integer input truncated to a character */
9451 long x;
9452 x = PyLong_AsLong(v);
9453 if (x == -1 && PyErr_Occurred())
9454 goto onError;
9455
9456 if (x < 0 || x > 0x10ffff) {
9457 PyErr_SetString(PyExc_OverflowError,
9458 "%c arg not in range(0x110000)");
9459 return -1;
9460 }
9461
9462#ifndef Py_UNICODE_WIDE
9463 if (x > 0xffff) {
9464 x -= 0x10000;
9465 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9466 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9467 return 2;
9468 }
9469#endif
9470 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009471 buf[1] = '\0';
9472 return 1;
9473 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009474
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009476 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009477 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009478 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479}
9480
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009481/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009482 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009483*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009484#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009485
Alexander Belopolsky40018472011-02-26 01:02:56 +00009486PyObject *
9487PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488{
9489 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009490 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 int args_owned = 0;
9492 PyUnicodeObject *result = NULL;
9493 PyObject *dict = NULL;
9494 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009495
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 PyErr_BadInternalCall();
9498 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499 }
9500 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009501 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 fmt = PyUnicode_AS_UNICODE(uformat);
9504 fmtcnt = PyUnicode_GET_SIZE(uformat);
9505
9506 reslen = rescnt = fmtcnt + 100;
9507 result = _PyUnicode_New(reslen);
9508 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 res = PyUnicode_AS_UNICODE(result);
9511
9512 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 arglen = PyTuple_Size(args);
9514 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 }
9516 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 arglen = -1;
9518 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009520 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009521 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523
9524 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009525 if (*fmt != '%') {
9526 if (--rescnt < 0) {
9527 rescnt = fmtcnt + 100;
9528 reslen += rescnt;
9529 if (_PyUnicode_Resize(&result, reslen) < 0)
9530 goto onError;
9531 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9532 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009533 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009535 }
9536 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 /* Got a format specifier */
9538 int flags = 0;
9539 Py_ssize_t width = -1;
9540 int prec = -1;
9541 Py_UNICODE c = '\0';
9542 Py_UNICODE fill;
9543 int isnumok;
9544 PyObject *v = NULL;
9545 PyObject *temp = NULL;
9546 Py_UNICODE *pbuf;
9547 Py_UNICODE sign;
9548 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009549 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009550
Benjamin Peterson29060642009-01-31 22:14:21 +00009551 fmt++;
9552 if (*fmt == '(') {
9553 Py_UNICODE *keystart;
9554 Py_ssize_t keylen;
9555 PyObject *key;
9556 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009557
Benjamin Peterson29060642009-01-31 22:14:21 +00009558 if (dict == NULL) {
9559 PyErr_SetString(PyExc_TypeError,
9560 "format requires a mapping");
9561 goto onError;
9562 }
9563 ++fmt;
9564 --fmtcnt;
9565 keystart = fmt;
9566 /* Skip over balanced parentheses */
9567 while (pcount > 0 && --fmtcnt >= 0) {
9568 if (*fmt == ')')
9569 --pcount;
9570 else if (*fmt == '(')
9571 ++pcount;
9572 fmt++;
9573 }
9574 keylen = fmt - keystart - 1;
9575 if (fmtcnt < 0 || pcount > 0) {
9576 PyErr_SetString(PyExc_ValueError,
9577 "incomplete format key");
9578 goto onError;
9579 }
9580#if 0
9581 /* keys are converted to strings using UTF-8 and
9582 then looked up since Python uses strings to hold
9583 variables names etc. in its namespaces and we
9584 wouldn't want to break common idioms. */
9585 key = PyUnicode_EncodeUTF8(keystart,
9586 keylen,
9587 NULL);
9588#else
9589 key = PyUnicode_FromUnicode(keystart, keylen);
9590#endif
9591 if (key == NULL)
9592 goto onError;
9593 if (args_owned) {
9594 Py_DECREF(args);
9595 args_owned = 0;
9596 }
9597 args = PyObject_GetItem(dict, key);
9598 Py_DECREF(key);
9599 if (args == NULL) {
9600 goto onError;
9601 }
9602 args_owned = 1;
9603 arglen = -1;
9604 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009605 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009606 while (--fmtcnt >= 0) {
9607 switch (c = *fmt++) {
9608 case '-': flags |= F_LJUST; continue;
9609 case '+': flags |= F_SIGN; continue;
9610 case ' ': flags |= F_BLANK; continue;
9611 case '#': flags |= F_ALT; continue;
9612 case '0': flags |= F_ZERO; continue;
9613 }
9614 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009615 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009616 if (c == '*') {
9617 v = getnextarg(args, arglen, &argidx);
9618 if (v == NULL)
9619 goto onError;
9620 if (!PyLong_Check(v)) {
9621 PyErr_SetString(PyExc_TypeError,
9622 "* wants int");
9623 goto onError;
9624 }
9625 width = PyLong_AsLong(v);
9626 if (width == -1 && PyErr_Occurred())
9627 goto onError;
9628 if (width < 0) {
9629 flags |= F_LJUST;
9630 width = -width;
9631 }
9632 if (--fmtcnt >= 0)
9633 c = *fmt++;
9634 }
9635 else if (c >= '0' && c <= '9') {
9636 width = c - '0';
9637 while (--fmtcnt >= 0) {
9638 c = *fmt++;
9639 if (c < '0' || c > '9')
9640 break;
9641 if ((width*10) / 10 != width) {
9642 PyErr_SetString(PyExc_ValueError,
9643 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009644 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009645 }
9646 width = width*10 + (c - '0');
9647 }
9648 }
9649 if (c == '.') {
9650 prec = 0;
9651 if (--fmtcnt >= 0)
9652 c = *fmt++;
9653 if (c == '*') {
9654 v = getnextarg(args, arglen, &argidx);
9655 if (v == NULL)
9656 goto onError;
9657 if (!PyLong_Check(v)) {
9658 PyErr_SetString(PyExc_TypeError,
9659 "* wants int");
9660 goto onError;
9661 }
9662 prec = PyLong_AsLong(v);
9663 if (prec == -1 && PyErr_Occurred())
9664 goto onError;
9665 if (prec < 0)
9666 prec = 0;
9667 if (--fmtcnt >= 0)
9668 c = *fmt++;
9669 }
9670 else if (c >= '0' && c <= '9') {
9671 prec = c - '0';
9672 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009673 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009674 if (c < '0' || c > '9')
9675 break;
9676 if ((prec*10) / 10 != prec) {
9677 PyErr_SetString(PyExc_ValueError,
9678 "prec too big");
9679 goto onError;
9680 }
9681 prec = prec*10 + (c - '0');
9682 }
9683 }
9684 } /* prec */
9685 if (fmtcnt >= 0) {
9686 if (c == 'h' || c == 'l' || c == 'L') {
9687 if (--fmtcnt >= 0)
9688 c = *fmt++;
9689 }
9690 }
9691 if (fmtcnt < 0) {
9692 PyErr_SetString(PyExc_ValueError,
9693 "incomplete format");
9694 goto onError;
9695 }
9696 if (c != '%') {
9697 v = getnextarg(args, arglen, &argidx);
9698 if (v == NULL)
9699 goto onError;
9700 }
9701 sign = 0;
9702 fill = ' ';
9703 switch (c) {
9704
9705 case '%':
9706 pbuf = formatbuf;
9707 /* presume that buffer length is at least 1 */
9708 pbuf[0] = '%';
9709 len = 1;
9710 break;
9711
9712 case 's':
9713 case 'r':
9714 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009715 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009716 temp = v;
9717 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009718 }
9719 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009720 if (c == 's')
9721 temp = PyObject_Str(v);
9722 else if (c == 'r')
9723 temp = PyObject_Repr(v);
9724 else
9725 temp = PyObject_ASCII(v);
9726 if (temp == NULL)
9727 goto onError;
9728 if (PyUnicode_Check(temp))
9729 /* nothing to do */;
9730 else {
9731 Py_DECREF(temp);
9732 PyErr_SetString(PyExc_TypeError,
9733 "%s argument has non-string str()");
9734 goto onError;
9735 }
9736 }
9737 pbuf = PyUnicode_AS_UNICODE(temp);
9738 len = PyUnicode_GET_SIZE(temp);
9739 if (prec >= 0 && len > prec)
9740 len = prec;
9741 break;
9742
9743 case 'i':
9744 case 'd':
9745 case 'u':
9746 case 'o':
9747 case 'x':
9748 case 'X':
9749 if (c == 'i')
9750 c = 'd';
9751 isnumok = 0;
9752 if (PyNumber_Check(v)) {
9753 PyObject *iobj=NULL;
9754
9755 if (PyLong_Check(v)) {
9756 iobj = v;
9757 Py_INCREF(iobj);
9758 }
9759 else {
9760 iobj = PyNumber_Long(v);
9761 }
9762 if (iobj!=NULL) {
9763 if (PyLong_Check(iobj)) {
9764 isnumok = 1;
9765 temp = formatlong(iobj, flags, prec, c);
9766 Py_DECREF(iobj);
9767 if (!temp)
9768 goto onError;
9769 pbuf = PyUnicode_AS_UNICODE(temp);
9770 len = PyUnicode_GET_SIZE(temp);
9771 sign = 1;
9772 }
9773 else {
9774 Py_DECREF(iobj);
9775 }
9776 }
9777 }
9778 if (!isnumok) {
9779 PyErr_Format(PyExc_TypeError,
9780 "%%%c format: a number is required, "
9781 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9782 goto onError;
9783 }
9784 if (flags & F_ZERO)
9785 fill = '0';
9786 break;
9787
9788 case 'e':
9789 case 'E':
9790 case 'f':
9791 case 'F':
9792 case 'g':
9793 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009794 temp = formatfloat(v, flags, prec, c);
9795 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009796 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009797 pbuf = PyUnicode_AS_UNICODE(temp);
9798 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009799 sign = 1;
9800 if (flags & F_ZERO)
9801 fill = '0';
9802 break;
9803
9804 case 'c':
9805 pbuf = formatbuf;
9806 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9807 if (len < 0)
9808 goto onError;
9809 break;
9810
9811 default:
9812 PyErr_Format(PyExc_ValueError,
9813 "unsupported format character '%c' (0x%x) "
9814 "at index %zd",
9815 (31<=c && c<=126) ? (char)c : '?',
9816 (int)c,
9817 (Py_ssize_t)(fmt - 1 -
9818 PyUnicode_AS_UNICODE(uformat)));
9819 goto onError;
9820 }
9821 if (sign) {
9822 if (*pbuf == '-' || *pbuf == '+') {
9823 sign = *pbuf++;
9824 len--;
9825 }
9826 else if (flags & F_SIGN)
9827 sign = '+';
9828 else if (flags & F_BLANK)
9829 sign = ' ';
9830 else
9831 sign = 0;
9832 }
9833 if (width < len)
9834 width = len;
9835 if (rescnt - (sign != 0) < width) {
9836 reslen -= rescnt;
9837 rescnt = width + fmtcnt + 100;
9838 reslen += rescnt;
9839 if (reslen < 0) {
9840 Py_XDECREF(temp);
9841 PyErr_NoMemory();
9842 goto onError;
9843 }
9844 if (_PyUnicode_Resize(&result, reslen) < 0) {
9845 Py_XDECREF(temp);
9846 goto onError;
9847 }
9848 res = PyUnicode_AS_UNICODE(result)
9849 + reslen - rescnt;
9850 }
9851 if (sign) {
9852 if (fill != ' ')
9853 *res++ = sign;
9854 rescnt--;
9855 if (width > len)
9856 width--;
9857 }
9858 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9859 assert(pbuf[0] == '0');
9860 assert(pbuf[1] == c);
9861 if (fill != ' ') {
9862 *res++ = *pbuf++;
9863 *res++ = *pbuf++;
9864 }
9865 rescnt -= 2;
9866 width -= 2;
9867 if (width < 0)
9868 width = 0;
9869 len -= 2;
9870 }
9871 if (width > len && !(flags & F_LJUST)) {
9872 do {
9873 --rescnt;
9874 *res++ = fill;
9875 } while (--width > len);
9876 }
9877 if (fill == ' ') {
9878 if (sign)
9879 *res++ = sign;
9880 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9881 assert(pbuf[0] == '0');
9882 assert(pbuf[1] == c);
9883 *res++ = *pbuf++;
9884 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009885 }
9886 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009887 Py_UNICODE_COPY(res, pbuf, len);
9888 res += len;
9889 rescnt -= len;
9890 while (--width >= len) {
9891 --rescnt;
9892 *res++ = ' ';
9893 }
9894 if (dict && (argidx < arglen) && c != '%') {
9895 PyErr_SetString(PyExc_TypeError,
9896 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009897 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009898 goto onError;
9899 }
9900 Py_XDECREF(temp);
9901 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902 } /* until end */
9903 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009904 PyErr_SetString(PyExc_TypeError,
9905 "not all arguments converted during string formatting");
9906 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907 }
9908
Thomas Woutersa96affe2006-03-12 00:29:36 +00009909 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009910 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009912 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913 }
9914 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009915 return (PyObject *)result;
9916
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918 Py_XDECREF(result);
9919 Py_DECREF(uformat);
9920 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009921 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922 }
9923 return NULL;
9924}
9925
Jeremy Hylton938ace62002-07-17 16:30:39 +00009926static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009927unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9928
Tim Peters6d6c1a32001-08-02 04:15:00 +00009929static PyObject *
9930unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9931{
Benjamin Peterson29060642009-01-31 22:14:21 +00009932 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009933 static char *kwlist[] = {"object", "encoding", "errors", 0};
9934 char *encoding = NULL;
9935 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009936
Benjamin Peterson14339b62009-01-31 16:36:08 +00009937 if (type != &PyUnicode_Type)
9938 return unicode_subtype_new(type, args, kwds);
9939 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009940 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009941 return NULL;
9942 if (x == NULL)
9943 return (PyObject *)_PyUnicode_New(0);
9944 if (encoding == NULL && errors == NULL)
9945 return PyObject_Str(x);
9946 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009947 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009948}
9949
Guido van Rossume023fe02001-08-30 03:12:59 +00009950static PyObject *
9951unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9952{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009953 PyUnicodeObject *tmp, *pnew;
9954 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009955
Benjamin Peterson14339b62009-01-31 16:36:08 +00009956 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9957 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9958 if (tmp == NULL)
9959 return NULL;
9960 assert(PyUnicode_Check(tmp));
9961 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9962 if (pnew == NULL) {
9963 Py_DECREF(tmp);
9964 return NULL;
9965 }
9966 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9967 if (pnew->str == NULL) {
9968 _Py_ForgetReference((PyObject *)pnew);
9969 PyObject_Del(pnew);
9970 Py_DECREF(tmp);
9971 return PyErr_NoMemory();
9972 }
9973 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9974 pnew->length = n;
9975 pnew->hash = tmp->hash;
9976 Py_DECREF(tmp);
9977 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009978}
9979
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009980PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009981 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009982\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009983Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009984encoding defaults to the current default string encoding.\n\
9985errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009986
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009987static PyObject *unicode_iter(PyObject *seq);
9988
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009990 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009991 "str", /* tp_name */
9992 sizeof(PyUnicodeObject), /* tp_size */
9993 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009995 (destructor)unicode_dealloc, /* tp_dealloc */
9996 0, /* tp_print */
9997 0, /* tp_getattr */
9998 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009999 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010000 unicode_repr, /* tp_repr */
10001 &unicode_as_number, /* tp_as_number */
10002 &unicode_as_sequence, /* tp_as_sequence */
10003 &unicode_as_mapping, /* tp_as_mapping */
10004 (hashfunc) unicode_hash, /* tp_hash*/
10005 0, /* tp_call*/
10006 (reprfunc) unicode_str, /* tp_str */
10007 PyObject_GenericGetAttr, /* tp_getattro */
10008 0, /* tp_setattro */
10009 0, /* tp_as_buffer */
10010 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000010011 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010012 unicode_doc, /* tp_doc */
10013 0, /* tp_traverse */
10014 0, /* tp_clear */
10015 PyUnicode_RichCompare, /* tp_richcompare */
10016 0, /* tp_weaklistoffset */
10017 unicode_iter, /* tp_iter */
10018 0, /* tp_iternext */
10019 unicode_methods, /* tp_methods */
10020 0, /* tp_members */
10021 0, /* tp_getset */
10022 &PyBaseObject_Type, /* tp_base */
10023 0, /* tp_dict */
10024 0, /* tp_descr_get */
10025 0, /* tp_descr_set */
10026 0, /* tp_dictoffset */
10027 0, /* tp_init */
10028 0, /* tp_alloc */
10029 unicode_new, /* tp_new */
10030 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010031};
10032
10033/* Initialize the Unicode implementation */
10034
Thomas Wouters78890102000-07-22 19:25:51 +000010035void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010037 int i;
10038
Thomas Wouters477c8d52006-05-27 19:21:47 +000010039 /* XXX - move this array to unicodectype.c ? */
10040 Py_UNICODE linebreak[] = {
10041 0x000A, /* LINE FEED */
10042 0x000D, /* CARRIAGE RETURN */
10043 0x001C, /* FILE SEPARATOR */
10044 0x001D, /* GROUP SEPARATOR */
10045 0x001E, /* RECORD SEPARATOR */
10046 0x0085, /* NEXT LINE */
10047 0x2028, /* LINE SEPARATOR */
10048 0x2029, /* PARAGRAPH SEPARATOR */
10049 };
10050
Fred Drakee4315f52000-05-09 19:53:39 +000010051 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010052 free_list = NULL;
10053 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010054 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010055 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010056 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010057
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010058 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010059 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010060 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010061 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010062
10063 /* initialize the linebreak bloom filter */
10064 bloom_linebreak = make_bloom_mask(
10065 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10066 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010067
10068 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069}
10070
10071/* Finalize the Unicode implementation */
10072
Christian Heimesa156e092008-02-16 07:38:31 +000010073int
10074PyUnicode_ClearFreeList(void)
10075{
10076 int freelist_size = numfree;
10077 PyUnicodeObject *u;
10078
10079 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010080 PyUnicodeObject *v = u;
10081 u = *(PyUnicodeObject **)u;
10082 if (v->str)
10083 PyObject_DEL(v->str);
10084 Py_XDECREF(v->defenc);
10085 PyObject_Del(v);
10086 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010087 }
10088 free_list = NULL;
10089 assert(numfree == 0);
10090 return freelist_size;
10091}
10092
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093void
Thomas Wouters78890102000-07-22 19:25:51 +000010094_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010095{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010096 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010098 Py_XDECREF(unicode_empty);
10099 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010100
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010101 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010102 if (unicode_latin1[i]) {
10103 Py_DECREF(unicode_latin1[i]);
10104 unicode_latin1[i] = NULL;
10105 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010106 }
Christian Heimesa156e092008-02-16 07:38:31 +000010107 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010109
Walter Dörwald16807132007-05-25 13:52:07 +000010110void
10111PyUnicode_InternInPlace(PyObject **p)
10112{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010113 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10114 PyObject *t;
10115 if (s == NULL || !PyUnicode_Check(s))
10116 Py_FatalError(
10117 "PyUnicode_InternInPlace: unicode strings only please!");
10118 /* If it's a subclass, we don't really know what putting
10119 it in the interned dict might do. */
10120 if (!PyUnicode_CheckExact(s))
10121 return;
10122 if (PyUnicode_CHECK_INTERNED(s))
10123 return;
10124 if (interned == NULL) {
10125 interned = PyDict_New();
10126 if (interned == NULL) {
10127 PyErr_Clear(); /* Don't leave an exception */
10128 return;
10129 }
10130 }
10131 /* It might be that the GetItem call fails even
10132 though the key is present in the dictionary,
10133 namely when this happens during a stack overflow. */
10134 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010135 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010136 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010137
Benjamin Peterson29060642009-01-31 22:14:21 +000010138 if (t) {
10139 Py_INCREF(t);
10140 Py_DECREF(*p);
10141 *p = t;
10142 return;
10143 }
Walter Dörwald16807132007-05-25 13:52:07 +000010144
Benjamin Peterson14339b62009-01-31 16:36:08 +000010145 PyThreadState_GET()->recursion_critical = 1;
10146 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10147 PyErr_Clear();
10148 PyThreadState_GET()->recursion_critical = 0;
10149 return;
10150 }
10151 PyThreadState_GET()->recursion_critical = 0;
10152 /* The two references in interned are not counted by refcnt.
10153 The deallocator will take care of this */
10154 Py_REFCNT(s) -= 2;
10155 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010156}
10157
10158void
10159PyUnicode_InternImmortal(PyObject **p)
10160{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010161 PyUnicode_InternInPlace(p);
10162 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10163 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10164 Py_INCREF(*p);
10165 }
Walter Dörwald16807132007-05-25 13:52:07 +000010166}
10167
10168PyObject *
10169PyUnicode_InternFromString(const char *cp)
10170{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010171 PyObject *s = PyUnicode_FromString(cp);
10172 if (s == NULL)
10173 return NULL;
10174 PyUnicode_InternInPlace(&s);
10175 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010176}
10177
Alexander Belopolsky40018472011-02-26 01:02:56 +000010178void
10179_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010180{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010181 PyObject *keys;
10182 PyUnicodeObject *s;
10183 Py_ssize_t i, n;
10184 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010185
Benjamin Peterson14339b62009-01-31 16:36:08 +000010186 if (interned == NULL || !PyDict_Check(interned))
10187 return;
10188 keys = PyDict_Keys(interned);
10189 if (keys == NULL || !PyList_Check(keys)) {
10190 PyErr_Clear();
10191 return;
10192 }
Walter Dörwald16807132007-05-25 13:52:07 +000010193
Benjamin Peterson14339b62009-01-31 16:36:08 +000010194 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10195 detector, interned unicode strings are not forcibly deallocated;
10196 rather, we give them their stolen references back, and then clear
10197 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010198
Benjamin Peterson14339b62009-01-31 16:36:08 +000010199 n = PyList_GET_SIZE(keys);
10200 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010201 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010202 for (i = 0; i < n; i++) {
10203 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10204 switch (s->state) {
10205 case SSTATE_NOT_INTERNED:
10206 /* XXX Shouldn't happen */
10207 break;
10208 case SSTATE_INTERNED_IMMORTAL:
10209 Py_REFCNT(s) += 1;
10210 immortal_size += s->length;
10211 break;
10212 case SSTATE_INTERNED_MORTAL:
10213 Py_REFCNT(s) += 2;
10214 mortal_size += s->length;
10215 break;
10216 default:
10217 Py_FatalError("Inconsistent interned string state.");
10218 }
10219 s->state = SSTATE_NOT_INTERNED;
10220 }
10221 fprintf(stderr, "total size of all interned strings: "
10222 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10223 "mortal/immortal\n", mortal_size, immortal_size);
10224 Py_DECREF(keys);
10225 PyDict_Clear(interned);
10226 Py_DECREF(interned);
10227 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010228}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010229
10230
10231/********************* Unicode Iterator **************************/
10232
10233typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010234 PyObject_HEAD
10235 Py_ssize_t it_index;
10236 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010237} unicodeiterobject;
10238
10239static void
10240unicodeiter_dealloc(unicodeiterobject *it)
10241{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010242 _PyObject_GC_UNTRACK(it);
10243 Py_XDECREF(it->it_seq);
10244 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010245}
10246
10247static int
10248unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10249{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010250 Py_VISIT(it->it_seq);
10251 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010252}
10253
10254static PyObject *
10255unicodeiter_next(unicodeiterobject *it)
10256{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 PyUnicodeObject *seq;
10258 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010259
Benjamin Peterson14339b62009-01-31 16:36:08 +000010260 assert(it != NULL);
10261 seq = it->it_seq;
10262 if (seq == NULL)
10263 return NULL;
10264 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010265
Benjamin Peterson14339b62009-01-31 16:36:08 +000010266 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10267 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010268 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010269 if (item != NULL)
10270 ++it->it_index;
10271 return item;
10272 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010273
Benjamin Peterson14339b62009-01-31 16:36:08 +000010274 Py_DECREF(seq);
10275 it->it_seq = NULL;
10276 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010277}
10278
10279static PyObject *
10280unicodeiter_len(unicodeiterobject *it)
10281{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010282 Py_ssize_t len = 0;
10283 if (it->it_seq)
10284 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10285 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010286}
10287
10288PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10289
10290static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010291 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010292 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010293 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010294};
10295
10296PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010297 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10298 "str_iterator", /* tp_name */
10299 sizeof(unicodeiterobject), /* tp_basicsize */
10300 0, /* tp_itemsize */
10301 /* methods */
10302 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10303 0, /* tp_print */
10304 0, /* tp_getattr */
10305 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010306 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010307 0, /* tp_repr */
10308 0, /* tp_as_number */
10309 0, /* tp_as_sequence */
10310 0, /* tp_as_mapping */
10311 0, /* tp_hash */
10312 0, /* tp_call */
10313 0, /* tp_str */
10314 PyObject_GenericGetAttr, /* tp_getattro */
10315 0, /* tp_setattro */
10316 0, /* tp_as_buffer */
10317 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10318 0, /* tp_doc */
10319 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10320 0, /* tp_clear */
10321 0, /* tp_richcompare */
10322 0, /* tp_weaklistoffset */
10323 PyObject_SelfIter, /* tp_iter */
10324 (iternextfunc)unicodeiter_next, /* tp_iternext */
10325 unicodeiter_methods, /* tp_methods */
10326 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010327};
10328
10329static PyObject *
10330unicode_iter(PyObject *seq)
10331{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010332 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010333
Benjamin Peterson14339b62009-01-31 16:36:08 +000010334 if (!PyUnicode_Check(seq)) {
10335 PyErr_BadInternalCall();
10336 return NULL;
10337 }
10338 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10339 if (it == NULL)
10340 return NULL;
10341 it->it_index = 0;
10342 Py_INCREF(seq);
10343 it->it_seq = (PyUnicodeObject *)seq;
10344 _PyObject_GC_TRACK(it);
10345 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010346}
10347
Martin v. Löwis5b222132007-06-10 09:51:05 +000010348size_t
10349Py_UNICODE_strlen(const Py_UNICODE *u)
10350{
10351 int res = 0;
10352 while(*u++)
10353 res++;
10354 return res;
10355}
10356
10357Py_UNICODE*
10358Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10359{
10360 Py_UNICODE *u = s1;
10361 while ((*u++ = *s2++));
10362 return s1;
10363}
10364
10365Py_UNICODE*
10366Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10367{
10368 Py_UNICODE *u = s1;
10369 while ((*u++ = *s2++))
10370 if (n-- == 0)
10371 break;
10372 return s1;
10373}
10374
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010375Py_UNICODE*
10376Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10377{
10378 Py_UNICODE *u1 = s1;
10379 u1 += Py_UNICODE_strlen(u1);
10380 Py_UNICODE_strcpy(u1, s2);
10381 return s1;
10382}
10383
Martin v. Löwis5b222132007-06-10 09:51:05 +000010384int
10385Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10386{
10387 while (*s1 && *s2 && *s1 == *s2)
10388 s1++, s2++;
10389 if (*s1 && *s2)
10390 return (*s1 < *s2) ? -1 : +1;
10391 if (*s1)
10392 return 1;
10393 if (*s2)
10394 return -1;
10395 return 0;
10396}
10397
Victor Stinneref8d95c2010-08-16 22:03:11 +000010398int
10399Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10400{
10401 register Py_UNICODE u1, u2;
10402 for (; n != 0; n--) {
10403 u1 = *s1;
10404 u2 = *s2;
10405 if (u1 != u2)
10406 return (u1 < u2) ? -1 : +1;
10407 if (u1 == '\0')
10408 return 0;
10409 s1++;
10410 s2++;
10411 }
10412 return 0;
10413}
10414
Martin v. Löwis5b222132007-06-10 09:51:05 +000010415Py_UNICODE*
10416Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10417{
10418 const Py_UNICODE *p;
10419 for (p = s; *p; p++)
10420 if (*p == c)
10421 return (Py_UNICODE*)p;
10422 return NULL;
10423}
10424
Victor Stinner331ea922010-08-10 16:37:20 +000010425Py_UNICODE*
10426Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10427{
10428 const Py_UNICODE *p;
10429 p = s + Py_UNICODE_strlen(s);
10430 while (p != s) {
10431 p--;
10432 if (*p == c)
10433 return (Py_UNICODE*)p;
10434 }
10435 return NULL;
10436}
10437
Victor Stinner71133ff2010-09-01 23:43:53 +000010438Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010439PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010440{
10441 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10442 Py_UNICODE *copy;
10443 Py_ssize_t size;
10444
10445 /* Ensure we won't overflow the size. */
10446 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10447 PyErr_NoMemory();
10448 return NULL;
10449 }
10450 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10451 size *= sizeof(Py_UNICODE);
10452 copy = PyMem_Malloc(size);
10453 if (copy == NULL) {
10454 PyErr_NoMemory();
10455 return NULL;
10456 }
10457 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10458 return copy;
10459}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010460
Georg Brandl66c221e2010-10-14 07:04:07 +000010461/* A _string module, to export formatter_parser and formatter_field_name_split
10462 to the string.Formatter class implemented in Python. */
10463
10464static PyMethodDef _string_methods[] = {
10465 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10466 METH_O, PyDoc_STR("split the argument as a field name")},
10467 {"formatter_parser", (PyCFunction) formatter_parser,
10468 METH_O, PyDoc_STR("parse the argument as a format string")},
10469 {NULL, NULL}
10470};
10471
10472static struct PyModuleDef _string_module = {
10473 PyModuleDef_HEAD_INIT,
10474 "_string",
10475 PyDoc_STR("string helper module"),
10476 0,
10477 _string_methods,
10478 NULL,
10479 NULL,
10480 NULL,
10481 NULL
10482};
10483
10484PyMODINIT_FUNC
10485PyInit__string(void)
10486{
10487 return PyModule_Create(&_string_module);
10488}
10489
10490
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010491#ifdef __cplusplus
10492}
10493#endif