blob: 0918671ec0a9619dd82062cfbe175034265ddf94 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Upper bound on the number of Unicode objects kept on the free list. */

#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "stay alive" optimization.

   Objects on the free list whose length is below this limit keep
   their allocated character buffer, which reduces malloc() overhead
   for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; little endian is the default. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
98 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +000099 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters.

   The table is indexed by code point (0..127); a 1 marks whitespace:
     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
144
Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
/* Same for linebreaks.

   The table is indexed by code point (0..127); a 1 marks a linebreak:
     0x000A LINE FEED               0x001C FILE SEPARATOR
     0x000B LINE TABULATION         0x001D GROUP SEPARATOR
     0x000C FORM FEED               0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN
*/
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
201/* stuff to implement simple "bloom filters" for Unicode characters.
202 to keep things simple, we use a single bitmask, using the least 5
203 bits from each unicode characters as the bit index. */
204
205/* the linebreak mask is set up by Unicode_Init below */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
Thomas Wouters477c8d52006-05-27 19:21:47 +0000217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
Antoine Pitrouf068f942010-01-13 14:19:12 +0000221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223
Benjamin Peterson29060642009-01-31 22:14:21 +0000224#define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Nonzero if chr occurs in set[0..setlen-1].  The bloom mask is tested
   first as a cheap pre-filter; unicode_member() is only called when the
   mask says "possibly present".
   The expansion is fully parenthesized so the macro can be negated or
   combined with other operators without changing its meaning.
   Note: chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
309/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000310 Ux0000 terminated; some code (e.g. new_identifier)
311 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312
313 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316*/
317
Alexander Belopolsky40018472011-02-26 01:02:56 +0000318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320{
321 register PyUnicodeObject *unicode;
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty);
326 return unicode_empty;
327 }
328
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000329 /* Ensure we won't overflow the size. */
330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331 return (PyUnicodeObject *)PyErr_NoMemory();
332 }
333
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000335 if (free_list) {
336 unicode = free_list;
337 free_list = *(PyUnicodeObject **)unicode;
338 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 if (unicode->str) {
340 /* Keep-Alive optimization: we only upsize the buffer,
341 never downsize it. */
342 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000343 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 PyObject_DEL(unicode->str);
345 unicode->str = NULL;
346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000347 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 }
352 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 }
354 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 if (unicode == NULL)
358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000363 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 PyErr_NoMemory();
365 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000367 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000368 * the caller fails before initializing str -- unicode_resize()
369 * reads str[0], and the Keep-Alive optimization can keep memory
370 * allocated for str alive across a call to unicode_dealloc(unicode).
371 * We don't want unicode_resize to read uninitialized memory in
372 * that case.
373 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000374 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000383 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000386 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388}
389
Alexander Belopolsky40018472011-02-26 01:02:56 +0000390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392{
Walter Dörwald16807132007-05-25 13:52:07 +0000393 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000394 case SSTATE_NOT_INTERNED:
395 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000396
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError(
402 "deletion of interned string failed");
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_IMMORTAL:
406 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 default:
409 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410 }
411
Guido van Rossum604ddf82001-12-06 20:03:56 +0000412 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000414 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416 PyObject_DEL(unicode->str);
417 unicode->str = NULL;
418 unicode->length = 0;
419 }
420 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000421 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 }
423 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000424 *(PyUnicodeObject **)unicode = free_list;
425 free_list = unicode;
426 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000429 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 }
433}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
Mark Dickinson081dfee2009-03-18 14:47:41 +0000581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
Walter Dörwald346737f2007-05-31 10:44:43 +0000681static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000684{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000685 *fmt++ = '%';
686 if (width) {
687 if (zeropad)
688 *fmt++ = '0';
689 fmt += sprintf(fmt, "%d", width);
690 }
691 if (precision)
692 fmt += sprintf(fmt, ".%d", precision);
693 if (longflag)
694 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000695 else if (longlongflag) {
696 /* longlongflag should only ever be nonzero on machines with
697 HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699 char *f = PY_FORMAT_LONG_LONG;
700 while (*f)
701 *fmt++ = *f++;
702#else
703 /* we shouldn't ever get here */
704 assert(0);
705 *fmt++ = 'l';
706#endif
707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000708 else if (size_tflag) {
709 char *f = PY_FORMAT_SIZE_T;
710 while (*f)
711 *fmt++ = *f++;
712 }
713 *fmt++ = c;
714 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000715}
716
/* helper for PyUnicode_FromFormatV() */

/* Parse the optional width, precision and length modifier of a conversion
   specification.

   f must point at the '%' that starts the specification.  The decimal width
   and the decimal precision (after a '.') are read first; a length modifier
   is then recognised only when it is directly followed by 'd', 'u' or 'i':
   "l" sets longflag, "ll" sets longlongflag (only with HAVE_LONG_LONG) and
   "z" sets size_tflag.  Each parsed value is stored through the matching
   p_* pointer unless that pointer is NULL.

   Returns a pointer to the character following the parsed part (normally the
   conversion character).  In two malformed cases the pointer is moved back
   by one character: when a precision is directly followed by '%', and when
   the string ends right after the parsed part. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' */
    f++;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width * 10) + (*f - '0');

    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision * 10) + (*f - '0');
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t variants %zd, %zu, %zi. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* Hand the results back to the caller; NULL means "not interested". */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
781
/* Append the NUL-terminated byte string `string` to the output, one output
   element per byte.  The macro relies on two variables of the expanding
   function: `copy` (a const char * scratch cursor) and `s` (the output
   cursor, left pointing just past the appended data).

   It is wrapped in do { } while (0) so that "appendstring(x);" is a single
   statement and stays correct inside an unbraced if/else. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy; ) \
            *s++ = *copy++; \
    } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794PyObject *
795PyUnicode_FromFormatV(const char *format, va_list vargs)
796{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 va_list count;
798 Py_ssize_t callcount = 0;
799 PyObject **callresults = NULL;
800 PyObject **callresult = NULL;
801 Py_ssize_t n = 0;
802 int width = 0;
803 int precision = 0;
804 int zeropad;
805 const char* f;
806 Py_UNICODE *s;
807 PyObject *string;
808 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000809 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* use abuffer instead of buffer, if we need more space
811 * (which can happen if there's a format specifier with width). */
812 char *abuffer = NULL;
813 char *realbuffer;
814 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000816 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000818 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000819 /* step 1: count the number of %S/%R/%A/%s format specifications
820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000824 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +0000825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000828 ++callcount;
829 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000830 else if (128 <= (unsigned char)*f) {
831 PyErr_Format(PyExc_ValueError,
832 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000833 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000834 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000835 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000837 }
838 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000840 if (callcount) {
841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842 if (!callresults) {
843 PyErr_NoMemory();
844 return NULL;
845 }
846 callresult = callresults;
847 }
848 /* step 3: figure out how large a buffer we need */
849 for (f = format; *f; f++) {
850 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000851#ifdef HAVE_LONG_LONG
Victor Stinner96865452011-03-01 23:44:09 +0000852 int longlongflag;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000853#endif
Victor Stinner96865452011-03-01 23:44:09 +0000854 const char* p;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855
Victor Stinner96865452011-03-01 23:44:09 +0000856 p = f;
857 f = parse_format_flags(f, &width, NULL,
858 NULL, &longlongflag, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859
Benjamin Peterson14339b62009-01-31 16:36:08 +0000860 switch (*f) {
861 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000862 {
863#ifndef Py_UNICODE_WIDE
864 int ordinal = va_arg(count, int);
865 if (ordinal > 0xffff)
866 n += 2;
867 else
868 n++;
869#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000870 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000871 n++;
872#endif
873 break;
874 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000875 case '%':
876 n++;
877 break;
878 case 'd': case 'u': case 'i': case 'x':
879 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000880#ifdef HAVE_LONG_LONG
881 if (longlongflag) {
882 if (width < MAX_LONG_LONG_CHARS)
883 width = MAX_LONG_LONG_CHARS;
884 }
885 else
886#endif
887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888 including sign. Decimal takes the most space. This
889 isn't enough for octal. If a width is specified we
890 need more (which we allocate later). */
891 if (width < MAX_LONG_CHARS)
892 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000893 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000894 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000895 if (abuffersize < width)
896 abuffersize = width;
897 break;
898 case 's':
899 {
900 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000901 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903 if (!str)
904 goto fail;
905 n += PyUnicode_GET_SIZE(str);
906 /* Remember the str and switch to the next slot */
907 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000908 break;
909 }
910 case 'U':
911 {
912 PyObject *obj = va_arg(count, PyObject *);
913 assert(obj && PyUnicode_Check(obj));
914 n += PyUnicode_GET_SIZE(obj);
915 break;
916 }
917 case 'V':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000921 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000922 assert(obj || str);
923 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +0000924 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000925 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000926 *callresult++ = NULL;
927 }
928 else {
929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930 if (!str_obj)
931 goto fail;
932 n += PyUnicode_GET_SIZE(str_obj);
933 *callresult++ = str_obj;
934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000935 break;
936 }
937 case 'S':
938 {
939 PyObject *obj = va_arg(count, PyObject *);
940 PyObject *str;
941 assert(obj);
942 str = PyObject_Str(obj);
943 if (!str)
944 goto fail;
945 n += PyUnicode_GET_SIZE(str);
946 /* Remember the str and switch to the next slot */
947 *callresult++ = str;
948 break;
949 }
950 case 'R':
951 {
952 PyObject *obj = va_arg(count, PyObject *);
953 PyObject *repr;
954 assert(obj);
955 repr = PyObject_Repr(obj);
956 if (!repr)
957 goto fail;
958 n += PyUnicode_GET_SIZE(repr);
959 /* Remember the repr and switch to the next slot */
960 *callresult++ = repr;
961 break;
962 }
963 case 'A':
964 {
965 PyObject *obj = va_arg(count, PyObject *);
966 PyObject *ascii;
967 assert(obj);
968 ascii = PyObject_ASCII(obj);
969 if (!ascii)
970 goto fail;
971 n += PyUnicode_GET_SIZE(ascii);
972 /* Remember the repr and switch to the next slot */
973 *callresult++ = ascii;
974 break;
975 }
976 case 'p':
977 (void) va_arg(count, int);
978 /* maximum 64-bit pointer representation:
979 * 0xffffffffffffffff
980 * so 19 characters is enough.
981 * XXX I count 18 -- what's the extra for?
982 */
983 n += 19;
984 break;
985 default:
986 /* if we stumble upon an unknown
987 formatting code, copy the rest of
988 the format string to the output
989 string. (we cannot just skip the
990 code, since there's no way to know
991 what's in the argument list) */
992 n += strlen(p);
993 goto expand;
994 }
995 } else
996 n++;
997 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000998 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000999 if (abuffersize > ITEM_BUFFER_LEN) {
1000 /* add 1 for sprintf's trailing null byte */
1001 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 if (!abuffer) {
1003 PyErr_NoMemory();
1004 goto fail;
1005 }
1006 realbuffer = abuffer;
1007 }
1008 else
1009 realbuffer = buffer;
1010 /* step 4: fill the buffer */
1011 /* Since we've analyzed how much space we need for the worst case,
1012 we don't have to resize the string.
1013 There can be no errors beyond this point. */
1014 string = PyUnicode_FromUnicode(NULL, n);
1015 if (!string)
1016 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001017
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 s = PyUnicode_AS_UNICODE(string);
1019 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 for (f = format; *f; f++) {
1022 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001023 const char* p;
1024 int longflag;
1025 int longlongflag;
1026 int size_tflag;
1027
1028 p = f;
1029 zeropad = (f[1] == '0');
1030 f = parse_format_flags(f, &width, &precision,
1031 &longflag, &longlongflag, &size_tflag);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001032
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 switch (*f) {
1034 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001035 {
1036 int ordinal = va_arg(vargs, int);
1037#ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 *s++ = 0xD800 | (ordinal >> 10);
1041 *s++ = 0xDC00 | (ordinal & 0x3FF);
1042 } else
1043#endif
1044 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001046 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001047 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001048 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
Victor Stinner6d970f42011-03-02 00:04:25 +00001050 width, precision, *f);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 if (longflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001053#ifdef HAVE_LONG_LONG
1054 else if (longlongflag)
1055 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1056#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001057 else if (size_tflag)
1058 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1059 else
1060 sprintf(realbuffer, fmt, va_arg(vargs, int));
1061 appendstring(realbuffer);
1062 break;
1063 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001064 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1065 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001066 if (longflag)
1067 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001068#ifdef HAVE_LONG_LONG
1069 else if (longlongflag)
1070 sprintf(realbuffer, fmt, va_arg(vargs,
1071 unsigned PY_LONG_LONG));
1072#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001073 else if (size_tflag)
1074 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1075 else
1076 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1077 appendstring(realbuffer);
1078 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001080 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001081 sprintf(realbuffer, fmt, va_arg(vargs, int));
1082 appendstring(realbuffer);
1083 break;
1084 case 's':
1085 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001086 /* unused, since we already have the result */
1087 (void) va_arg(vargs, char *);
1088 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1089 PyUnicode_GET_SIZE(*callresult));
1090 s += PyUnicode_GET_SIZE(*callresult);
1091 /* We're done with the unicode()/repr() => forget it */
1092 Py_DECREF(*callresult);
1093 /* switch to next unicode()/repr() result */
1094 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001095 break;
1096 }
1097 case 'U':
1098 {
1099 PyObject *obj = va_arg(vargs, PyObject *);
1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102 s += size;
1103 break;
1104 }
1105 case 'V':
1106 {
1107 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001108 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001109 if (obj) {
1110 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1111 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1112 s += size;
1113 } else {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1115 PyUnicode_GET_SIZE(*callresult));
1116 s += PyUnicode_GET_SIZE(*callresult);
1117 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001119 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001120 break;
1121 }
1122 case 'S':
1123 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001124 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 {
1126 Py_UNICODE *ucopy;
1127 Py_ssize_t usize;
1128 Py_ssize_t upos;
1129 /* unused, since we already have the result */
1130 (void) va_arg(vargs, PyObject *);
1131 ucopy = PyUnicode_AS_UNICODE(*callresult);
1132 usize = PyUnicode_GET_SIZE(*callresult);
1133 for (upos = 0; upos<usize;)
1134 *s++ = ucopy[upos++];
1135 /* We're done with the unicode()/repr() => forget it */
1136 Py_DECREF(*callresult);
1137 /* switch to next unicode()/repr() result */
1138 ++callresult;
1139 break;
1140 }
1141 case 'p':
1142 sprintf(buffer, "%p", va_arg(vargs, void*));
1143 /* %p is ill-defined: ensure leading 0x. */
1144 if (buffer[1] == 'X')
1145 buffer[1] = 'x';
1146 else if (buffer[1] != 'x') {
1147 memmove(buffer+2, buffer, strlen(buffer)+1);
1148 buffer[0] = '0';
1149 buffer[1] = 'x';
1150 }
1151 appendstring(buffer);
1152 break;
1153 case '%':
1154 *s++ = '%';
1155 break;
1156 default:
1157 appendstring(p);
1158 goto end;
1159 }
Victor Stinner1205f272010-09-11 00:54:47 +00001160 }
Victor Stinner1205f272010-09-11 00:54:47 +00001161 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 *s++ = *f;
1163 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
Benjamin Peterson29060642009-01-31 22:14:21 +00001165 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 if (callresults)
1167 PyObject_Free(callresults);
1168 if (abuffer)
1169 PyObject_Free(abuffer);
1170 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1171 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001173 if (callresults) {
1174 PyObject **callresult2 = callresults;
1175 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001176 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001177 ++callresult2;
1178 }
1179 PyObject_Free(callresults);
1180 }
1181 if (abuffer)
1182 PyObject_Free(abuffer);
1183 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001184}
1185
1186#undef appendstring
1187
1188PyObject *
1189PyUnicode_FromFormat(const char *format, ...)
1190{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 PyObject* ret;
1192 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001193
1194#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001195 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001196#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001197 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001198#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001199 ret = PyUnicode_FromFormatV(format, vargs);
1200 va_end(vargs);
1201 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001202}
1203
Victor Stinner5593d8a2010-10-02 11:11:27 +00001204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1205 convert a Unicode object to a wide character string.
1206
1207 - If w is NULL: return the number of wide characters (including the nul
1208 character) required to convert the unicode object. Ignore size argument.
1209
1210 - Otherwise: return the number of wide characters (excluding the nul
1211 character) written into w. Write at most size wide characters (including
1212 the nul character). */
1213static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001214unicode_aswidechar(PyUnicodeObject *unicode,
1215 wchar_t *w,
1216 Py_ssize_t size)
1217{
1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001219 Py_ssize_t res;
1220 if (w != NULL) {
1221 res = PyUnicode_GET_SIZE(unicode);
1222 if (size > res)
1223 size = res + 1;
1224 else
1225 res = size;
1226 memcpy(w, unicode->str, size * sizeof(wchar_t));
1227 return res;
1228 }
1229 else
1230 return PyUnicode_GET_SIZE(unicode) + 1;
1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1232 register const Py_UNICODE *u;
1233 const Py_UNICODE *uend;
1234 const wchar_t *worig, *wend;
1235 Py_ssize_t nchar;
1236
Victor Stinner137c34c2010-09-29 10:25:54 +00001237 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001238 uend = u + PyUnicode_GET_SIZE(unicode);
1239 if (w != NULL) {
1240 worig = w;
1241 wend = w + size;
1242 while (u != uend && w != wend) {
1243 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1244 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1245 {
1246 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1247 u += 2;
1248 }
1249 else {
1250 *w = *u;
1251 u++;
1252 }
1253 w++;
1254 }
1255 if (w != wend)
1256 *w = L'\0';
1257 return w - worig;
1258 }
1259 else {
1260 nchar = 1; /* nul character at the end */
1261 while (u != uend) {
1262 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1263 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1264 u += 2;
1265 else
1266 u++;
1267 nchar++;
1268 }
1269 }
1270 return nchar;
1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1272 register Py_UNICODE *u, *uend, ordinal;
1273 register Py_ssize_t i;
1274 wchar_t *worig, *wend;
1275 Py_ssize_t nchar;
1276
1277 u = PyUnicode_AS_UNICODE(unicode);
1278 uend = u + PyUnicode_GET_SIZE(u);
1279 if (w != NULL) {
1280 worig = w;
1281 wend = w + size;
1282 while (u != uend && w != wend) {
1283 ordinal = *u;
1284 if (ordinal > 0xffff) {
1285 ordinal -= 0x10000;
1286 *w++ = 0xD800 | (ordinal >> 10);
1287 *w++ = 0xDC00 | (ordinal & 0x3FF);
1288 }
1289 else
1290 *w++ = ordinal;
1291 u++;
1292 }
1293 if (w != wend)
1294 *w = 0;
1295 return w - worig;
1296 }
1297 else {
1298 nchar = 1; /* nul character */
1299 while (u != uend) {
1300 if (*u > 0xffff)
1301 nchar += 2;
1302 else
1303 nchar++;
1304 u++;
1305 }
1306 return nchar;
1307 }
1308#else
1309# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001310#endif
1311}
1312
1313Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001315 wchar_t *w,
1316 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317{
1318 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001319 PyErr_BadInternalCall();
1320 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323}
1324
Victor Stinner137c34c2010-09-29 10:25:54 +00001325wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001326PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001327 Py_ssize_t *size)
1328{
1329 wchar_t* buffer;
1330 Py_ssize_t buflen;
1331
1332 if (unicode == NULL) {
1333 PyErr_BadInternalCall();
1334 return NULL;
1335 }
1336
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001338 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001339 PyErr_NoMemory();
1340 return NULL;
1341 }
1342
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1344 if (buffer == NULL) {
1345 PyErr_NoMemory();
1346 return NULL;
1347 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001348 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001349 if (size != NULL)
1350 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001351 return buffer;
1352}
1353
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354#endif
1355
Alexander Belopolsky40018472011-02-26 01:02:56 +00001356PyObject *
1357PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001358{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001359 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001361 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 PyErr_SetString(PyExc_ValueError,
1363 "chr() arg not in range(0x110000)");
1364 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001365 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001366
1367#ifndef Py_UNICODE_WIDE
1368 if (ordinal > 0xffff) {
1369 ordinal -= 0x10000;
1370 s[0] = 0xD800 | (ordinal >> 10);
1371 s[1] = 0xDC00 | (ordinal & 0x3FF);
1372 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001373 }
1374#endif
1375
Hye-Shik Chang40574832004-04-06 07:24:51 +00001376 s[0] = (Py_UNICODE)ordinal;
1377 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001378}
1379
Alexander Belopolsky40018472011-02-26 01:02:56 +00001380PyObject *
1381PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001383 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001384 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001385 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001386 Py_INCREF(obj);
1387 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001388 }
1389 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 /* For a Unicode subtype that's not a Unicode object,
1391 return a true Unicode object with the same data. */
1392 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1393 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001394 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001395 PyErr_Format(PyExc_TypeError,
1396 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001397 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001398 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001399}
1400
Alexander Belopolsky40018472011-02-26 01:02:56 +00001401PyObject *
1402PyUnicode_FromEncodedObject(register PyObject *obj,
1403 const char *encoding,
1404 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001405{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001406 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001407 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001408
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 PyErr_BadInternalCall();
1411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Decoding bytes objects is the most common case and should be fast */
1415 if (PyBytes_Check(obj)) {
1416 if (PyBytes_GET_SIZE(obj) == 0) {
1417 Py_INCREF(unicode_empty);
1418 v = (PyObject *) unicode_empty;
1419 }
1420 else {
1421 v = PyUnicode_Decode(
1422 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1423 encoding, errors);
1424 }
1425 return v;
1426 }
1427
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001428 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 PyErr_SetString(PyExc_TypeError,
1430 "decoding str is not supported");
1431 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001432 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001433
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001434 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1435 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1436 PyErr_Format(PyExc_TypeError,
1437 "coercing to str: need bytes, bytearray "
1438 "or buffer-like object, %.80s found",
1439 Py_TYPE(obj)->tp_name);
1440 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001441 }
Tim Petersced69f82003-09-16 20:30:58 +00001442
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001443 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001445 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 }
Tim Petersced69f82003-09-16 20:30:58 +00001447 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001448 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001449
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001450 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001451 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
/* Write a normalized copy of encoding into lower: upper case letters are
   converted to lower case and '_' is replaced with '-', in order to catch
   e.g. UTF_8.

   lower is a buffer of lower_len bytes.  Return 1 on success, in which case
   lower is nul-terminated.  Return 0 on error (encoding is longer than
   lower_len-1); lower is then left without a terminating nul. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last byte of the buffer, reserved for the terminating nul */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        char ch = *src;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(ch))
            ch = Py_TOLOWER(ch);
        else if (ch == '_')
            ch = '-';
        *dst++ = ch;
    }
    *dst = '\0';
    return 1;
}
1486
Alexander Belopolsky40018472011-02-26 01:02:56 +00001487PyObject *
1488PyUnicode_Decode(const char *s,
1489 Py_ssize_t size,
1490 const char *encoding,
1491 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001492{
1493 PyObject *buffer = NULL, *unicode;
1494 Py_buffer info;
1495 char lower[11]; /* Enough for any encoding shortcut */
1496
1497 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001499
1500 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001501 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001502 if ((strcmp(lower, "utf-8") == 0) ||
1503 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001504 return PyUnicode_DecodeUTF8(s, size, errors);
1505 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001506 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001507 (strcmp(lower, "iso-8859-1") == 0))
1508 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02001509#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00001510 else if (strcmp(lower, "mbcs") == 0)
1511 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001513 else if (strcmp(lower, "ascii") == 0)
1514 return PyUnicode_DecodeASCII(s, size, errors);
1515 else if (strcmp(lower, "utf-16") == 0)
1516 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1517 else if (strcmp(lower, "utf-32") == 0)
1518 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520
1521 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001522 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001523 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001524 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001525 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 if (buffer == NULL)
1527 goto onError;
1528 unicode = PyCodec_Decode(buffer, encoding, errors);
1529 if (unicode == NULL)
1530 goto onError;
1531 if (!PyUnicode_Check(unicode)) {
1532 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001533 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001534 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535 Py_DECREF(unicode);
1536 goto onError;
1537 }
1538 Py_DECREF(buffer);
1539 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001540
Benjamin Peterson29060642009-01-31 22:14:21 +00001541 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 Py_XDECREF(buffer);
1543 return NULL;
1544}
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546PyObject *
1547PyUnicode_AsDecodedObject(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 return v;
1566
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001568 return NULL;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571PyObject *
1572PyUnicode_AsDecodedUnicode(PyObject *unicode,
1573 const char *encoding,
1574 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575{
1576 PyObject *v;
1577
1578 if (!PyUnicode_Check(unicode)) {
1579 PyErr_BadArgument();
1580 goto onError;
1581 }
1582
1583 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001585
1586 /* Decode via the codec registry */
1587 v = PyCodec_Decode(unicode, encoding, errors);
1588 if (v == NULL)
1589 goto onError;
1590 if (!PyUnicode_Check(v)) {
1591 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001592 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001593 Py_TYPE(v)->tp_name);
1594 Py_DECREF(v);
1595 goto onError;
1596 }
1597 return v;
1598
Benjamin Peterson29060642009-01-31 22:14:21 +00001599 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001600 return NULL;
1601}
1602
Alexander Belopolsky40018472011-02-26 01:02:56 +00001603PyObject *
1604PyUnicode_Encode(const Py_UNICODE *s,
1605 Py_ssize_t size,
1606 const char *encoding,
1607 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608{
1609 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001610
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 unicode = PyUnicode_FromUnicode(s, size);
1612 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1615 Py_DECREF(unicode);
1616 return v;
1617}
1618
Alexander Belopolsky40018472011-02-26 01:02:56 +00001619PyObject *
1620PyUnicode_AsEncodedObject(PyObject *unicode,
1621 const char *encoding,
1622 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001623{
1624 PyObject *v;
1625
1626 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument();
1628 goto onError;
1629 }
1630
1631 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001633
1634 /* Encode via the codec registry */
1635 v = PyCodec_Encode(unicode, encoding, errors);
1636 if (v == NULL)
1637 goto onError;
1638 return v;
1639
Benjamin Peterson29060642009-01-31 22:14:21 +00001640 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001641 return NULL;
1642}
1643
Victor Stinnerad158722010-10-27 00:25:46 +00001644PyObject *
1645PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001646{
Victor Stinner99b95382011-07-04 14:23:54 +02001647#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00001648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode),
1650 NULL);
1651#elif defined(__APPLE__)
1652 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 "surrogateescape");
1655#else
Victor Stinner793b5312011-04-27 00:24:21 +02001656 PyInterpreterState *interp = PyThreadState_GET()->interp;
1657 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1658 cannot use it to encode and decode filenames before it is loaded. Load
1659 the Python codec requires to encode at least its own filename. Use the C
1660 version of the locale codec until the codec registry is initialized and
1661 the Python codec is loaded.
1662
1663 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1664 cannot only rely on it: check also interp->fscodec_initialized for
1665 subinterpreters. */
1666 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001667 return PyUnicode_AsEncodedString(unicode,
1668 Py_FileSystemDefaultEncoding,
1669 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001670 }
1671 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001672 /* locale encoding with surrogateescape */
1673 wchar_t *wchar;
1674 char *bytes;
1675 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001676 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001677
1678 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1679 if (wchar == NULL)
1680 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001681 bytes = _Py_wchar2char(wchar, &error_pos);
1682 if (bytes == NULL) {
1683 if (error_pos != (size_t)-1) {
1684 char *errmsg = strerror(errno);
1685 PyObject *exc = NULL;
1686 if (errmsg == NULL)
1687 errmsg = "Py_wchar2char() failed";
1688 raise_encode_exception(&exc,
1689 "filesystemencoding",
1690 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1691 error_pos, error_pos+1,
1692 errmsg);
1693 Py_XDECREF(exc);
1694 }
1695 else
1696 PyErr_NoMemory();
1697 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001698 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001699 }
1700 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001701
1702 bytes_obj = PyBytes_FromString(bytes);
1703 PyMem_Free(bytes);
1704 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001705 }
Victor Stinnerad158722010-10-27 00:25:46 +00001706#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001707}
1708
Alexander Belopolsky40018472011-02-26 01:02:56 +00001709PyObject *
1710PyUnicode_AsEncodedString(PyObject *unicode,
1711 const char *encoding,
1712 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713{
1714 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001715 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001716
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 if (!PyUnicode_Check(unicode)) {
1718 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720 }
Fred Drakee4315f52000-05-09 19:53:39 +00001721
Victor Stinner2f283c22011-03-02 01:21:46 +00001722 if (encoding == NULL) {
1723 if (errors == NULL || strcmp(errors, "strict") == 0)
1724 return PyUnicode_AsUTF8String(unicode);
1725 else
1726 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1727 PyUnicode_GET_SIZE(unicode),
1728 errors);
1729 }
Fred Drakee4315f52000-05-09 19:53:39 +00001730
1731 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001732 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001733 if ((strcmp(lower, "utf-8") == 0) ||
1734 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00001735 {
Victor Stinner2f283c22011-03-02 01:21:46 +00001736 if (errors == NULL || strcmp(errors, "strict") == 0)
Victor Stinnera5c68c32011-03-02 01:03:14 +00001737 return PyUnicode_AsUTF8String(unicode);
Victor Stinner2f283c22011-03-02 01:21:46 +00001738 else
Victor Stinnera5c68c32011-03-02 01:03:14 +00001739 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1740 PyUnicode_GET_SIZE(unicode),
1741 errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00001742 }
Victor Stinner37296e82010-06-10 13:36:23 +00001743 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001744 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001745 (strcmp(lower, "iso-8859-1") == 0))
1746 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1747 PyUnicode_GET_SIZE(unicode),
1748 errors);
Victor Stinner99b95382011-07-04 14:23:54 +02001749#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00001750 else if (strcmp(lower, "mbcs") == 0)
1751 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1752 PyUnicode_GET_SIZE(unicode),
1753 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001754#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001755 else if (strcmp(lower, "ascii") == 0)
1756 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1757 PyUnicode_GET_SIZE(unicode),
1758 errors);
1759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760
1761 /* Encode via the codec registry */
1762 v = PyCodec_Encode(unicode, encoding, errors);
1763 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001764 return NULL;
1765
1766 /* The normal path */
1767 if (PyBytes_Check(v))
1768 return v;
1769
1770 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001771 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001772 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001773 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001774
1775 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1776 "encoder %s returned bytearray instead of bytes",
1777 encoding);
1778 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001779 Py_DECREF(v);
1780 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001781 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001782
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001783 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1784 Py_DECREF(v);
1785 return b;
1786 }
1787
1788 PyErr_Format(PyExc_TypeError,
1789 "encoder did not return a bytes object (type=%.400s)",
1790 Py_TYPE(v)->tp_name);
1791 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001792 return NULL;
1793}
1794
Alexander Belopolsky40018472011-02-26 01:02:56 +00001795PyObject *
1796PyUnicode_AsEncodedUnicode(PyObject *unicode,
1797 const char *encoding,
1798 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001799{
1800 PyObject *v;
1801
1802 if (!PyUnicode_Check(unicode)) {
1803 PyErr_BadArgument();
1804 goto onError;
1805 }
1806
1807 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001809
1810 /* Encode via the codec registry */
1811 v = PyCodec_Encode(unicode, encoding, errors);
1812 if (v == NULL)
1813 goto onError;
1814 if (!PyUnicode_Check(v)) {
1815 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001816 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001817 Py_TYPE(v)->tp_name);
1818 Py_DECREF(v);
1819 goto onError;
1820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001822
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824 return NULL;
1825}
1826
Alexander Belopolsky40018472011-02-26 01:02:56 +00001827PyObject *
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001828_PyUnicode_AsDefaultEncodedString(PyObject *unicode)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001829{
1830 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001831 if (v)
1832 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001833 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001834 PyUnicode_GET_SIZE(unicode),
1835 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001836 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001837 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001838 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001839 return v;
1840}
1841
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001842PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001843PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001844 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001845 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1846}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001847
Christian Heimes5894ba72007-11-04 11:43:14 +00001848PyObject*
1849PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1850{
Victor Stinner99b95382011-07-04 14:23:54 +02001851#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00001852 return PyUnicode_DecodeMBCS(s, size, NULL);
1853#elif defined(__APPLE__)
1854 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1855#else
Victor Stinner793b5312011-04-27 00:24:21 +02001856 PyInterpreterState *interp = PyThreadState_GET()->interp;
1857 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1858 cannot use it to encode and decode filenames before it is loaded. Load
1859 the Python codec requires to encode at least its own filename. Use the C
1860 version of the locale codec until the codec registry is initialized and
1861 the Python codec is loaded.
1862
1863 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1864 cannot only rely on it: check also interp->fscodec_initialized for
1865 subinterpreters. */
1866 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001867 return PyUnicode_Decode(s, size,
1868 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001869 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001870 }
1871 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001872 /* locale encoding with surrogateescape */
1873 wchar_t *wchar;
1874 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001875 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001876
1877 if (s[size] != '\0' || size != strlen(s)) {
1878 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1879 return NULL;
1880 }
1881
Victor Stinner168e1172010-10-16 23:16:16 +00001882 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001883 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001884 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001885
Victor Stinner168e1172010-10-16 23:16:16 +00001886 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001887 PyMem_Free(wchar);
1888 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001889 }
Victor Stinnerad158722010-10-27 00:25:46 +00001890#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001891}
1892
Martin v. Löwis011e8422009-05-05 04:43:17 +00001893
1894int
1895PyUnicode_FSConverter(PyObject* arg, void* addr)
1896{
1897 PyObject *output = NULL;
1898 Py_ssize_t size;
1899 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001900 if (arg == NULL) {
1901 Py_DECREF(*(PyObject**)addr);
1902 return 1;
1903 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001904 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001905 output = arg;
1906 Py_INCREF(output);
1907 }
1908 else {
1909 arg = PyUnicode_FromObject(arg);
1910 if (!arg)
1911 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001912 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001913 Py_DECREF(arg);
1914 if (!output)
1915 return 0;
1916 if (!PyBytes_Check(output)) {
1917 Py_DECREF(output);
1918 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1919 return 0;
1920 }
1921 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001922 size = PyBytes_GET_SIZE(output);
1923 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001924 if (size != strlen(data)) {
1925 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1926 Py_DECREF(output);
1927 return 0;
1928 }
1929 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001930 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001931}
1932
1933
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001934int
1935PyUnicode_FSDecoder(PyObject* arg, void* addr)
1936{
1937 PyObject *output = NULL;
1938 Py_ssize_t size;
1939 void *data;
1940 if (arg == NULL) {
1941 Py_DECREF(*(PyObject**)addr);
1942 return 1;
1943 }
1944 if (PyUnicode_Check(arg)) {
1945 output = arg;
1946 Py_INCREF(output);
1947 }
1948 else {
1949 arg = PyBytes_FromObject(arg);
1950 if (!arg)
1951 return 0;
1952 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1953 PyBytes_GET_SIZE(arg));
1954 Py_DECREF(arg);
1955 if (!output)
1956 return 0;
1957 if (!PyUnicode_Check(output)) {
1958 Py_DECREF(output);
1959 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1960 return 0;
1961 }
1962 }
1963 size = PyUnicode_GET_SIZE(output);
1964 data = PyUnicode_AS_UNICODE(output);
1965 if (size != Py_UNICODE_strlen(data)) {
1966 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1967 Py_DECREF(output);
1968 return 0;
1969 }
1970 *(PyObject**)addr = output;
1971 return Py_CLEANUP_SUPPORTED;
1972}
1973
1974
Martin v. Löwis5b222132007-06-10 09:51:05 +00001975char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001976_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001977{
Christian Heimesf3863112007-11-22 07:46:41 +00001978 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001979 if (!PyUnicode_Check(unicode)) {
1980 PyErr_BadArgument();
1981 return NULL;
1982 }
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001983 bytes = _PyUnicode_AsDefaultEncodedString(unicode);
Christian Heimesf3863112007-11-22 07:46:41 +00001984 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001985 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001986 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001987 *psize = PyBytes_GET_SIZE(bytes);
1988 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001989}
1990
1991char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001992_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001993{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001994 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001995}
1996
Alexander Belopolsky40018472011-02-26 01:02:56 +00001997Py_UNICODE *
1998PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999{
2000 if (!PyUnicode_Check(unicode)) {
2001 PyErr_BadArgument();
2002 goto onError;
2003 }
2004 return PyUnicode_AS_UNICODE(unicode);
2005
Benjamin Peterson29060642009-01-31 22:14:21 +00002006 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007 return NULL;
2008}
2009
Alexander Belopolsky40018472011-02-26 01:02:56 +00002010Py_ssize_t
2011PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012{
2013 if (!PyUnicode_Check(unicode)) {
2014 PyErr_BadArgument();
2015 goto onError;
2016 }
2017 return PyUnicode_GET_SIZE(unicode);
2018
Benjamin Peterson29060642009-01-31 22:14:21 +00002019 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020 return -1;
2021}
2022
/* Name of the default encoding; this is the fixed string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2028
Victor Stinner554f3f02010-06-16 23:33:54 +00002029/* create or adjust a UnicodeDecodeError */
2030static void
2031make_decode_exception(PyObject **exceptionObject,
2032 const char *encoding,
2033 const char *input, Py_ssize_t length,
2034 Py_ssize_t startpos, Py_ssize_t endpos,
2035 const char *reason)
2036{
2037 if (*exceptionObject == NULL) {
2038 *exceptionObject = PyUnicodeDecodeError_Create(
2039 encoding, input, length, startpos, endpos, reason);
2040 }
2041 else {
2042 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2043 goto onError;
2044 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2045 goto onError;
2046 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2047 goto onError;
2048 }
2049 return;
2050
2051onError:
2052 Py_DECREF(*exceptionObject);
2053 *exceptionObject = NULL;
2054}
2055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056/* error handling callback helper:
2057 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002058 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002059 and adjust various state variables.
2060 return 0 on success, -1 on error
2061*/
2062
Alexander Belopolsky40018472011-02-26 01:02:56 +00002063static int
2064unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2065 const char *encoding, const char *reason,
2066 const char **input, const char **inend, Py_ssize_t *startinpos,
2067 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2068 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002070 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002071
2072 PyObject *restuple = NULL;
2073 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002074 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002075 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002076 Py_ssize_t requiredsize;
2077 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002078 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002079 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002080 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 int res = -1;
2082
2083 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002084 *errorHandler = PyCodec_LookupError(errors);
2085 if (*errorHandler == NULL)
2086 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002087 }
2088
Victor Stinner554f3f02010-06-16 23:33:54 +00002089 make_decode_exception(exceptionObject,
2090 encoding,
2091 *input, *inend - *input,
2092 *startinpos, *endinpos,
2093 reason);
2094 if (*exceptionObject == NULL)
2095 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002096
2097 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2098 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002099 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002100 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002101 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002102 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002103 }
2104 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002105 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002106
2107 /* Copy back the bytes variables, which might have been modified by the
2108 callback */
2109 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2110 if (!inputobj)
2111 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002112 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002113 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002114 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002115 *input = PyBytes_AS_STRING(inputobj);
2116 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002117 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002118 /* we can DECREF safely, as the exception has another reference,
2119 so the object won't go away. */
2120 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002121
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002123 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002124 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002125 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2126 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002127 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002128
2129 /* need more space? (at least enough for what we
2130 have+the replacement+the rest of the string (starting
2131 at the new input position), so we won't have to check space
2132 when there are no errors in the rest of the string) */
2133 repptr = PyUnicode_AS_UNICODE(repunicode);
2134 repsize = PyUnicode_GET_SIZE(repunicode);
2135 requiredsize = *outpos + repsize + insize-newpos;
2136 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002137 if (requiredsize<2*outsize)
2138 requiredsize = 2*outsize;
2139 if (_PyUnicode_Resize(output, requiredsize) < 0)
2140 goto onError;
2141 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 }
2143 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002144 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002145 Py_UNICODE_COPY(*outptr, repptr, repsize);
2146 *outptr += repsize;
2147 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002148
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002149 /* we made it! */
2150 res = 0;
2151
Benjamin Peterson29060642009-01-31 22:14:21 +00002152 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002153 Py_XDECREF(restuple);
2154 return res;
2155}
2156
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002157/* --- UTF-7 Codec -------------------------------------------------------- */
2158
Antoine Pitrou244651a2009-05-04 18:56:13 +00002159/* See RFC2152 for details. We encode conservatively and decode liberally. */
2160
2161/* Three simple macros defining base-64. */
2162
2163/* Is c a base-64 character? */
2164
2165#define IS_BASE64(c) \
2166 (((c) >= 'A' && (c) <= 'Z') || \
2167 ((c) >= 'a' && (c) <= 'z') || \
2168 ((c) >= '0' && (c) <= '9') || \
2169 (c) == '+' || (c) == '/')
2170
2171/* given that c is a base-64 character, what is its base-64 value? */
2172
2173#define FROM_BASE64(c) \
2174 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
2175 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
2176 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
2177 (c) == '+' ? 62 : 63)
2178
2179/* What is the base-64 character of the bottom 6 bits of n? */
2180
2181#define TO_BASE64(n) \
2182 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
2183
/* DECODE_DIRECT: true when a byte found in UTF-7 input stands for
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
2191
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets, indexed by ASCII code (0..127):
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
2225
/* ENCODE_DIRECT: true when character c may be written to UTF-7 output
 * as itself.  Category 0 characters (see utf7_category) always may;
 * category 2 (whitespace) only when directWS is set, and category 1
 * (Set O) only when directO is set.  RFC2152 leaves both choices to the
 * application, hence the two flags.  NUL and anything >= 128 never
 * qualify; the range test comes first so the table is never indexed out
 * of bounds. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002237
Alexander Belopolsky40018472011-02-26 01:02:56 +00002238PyObject *
2239PyUnicode_DecodeUTF7(const char *s,
2240 Py_ssize_t size,
2241 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002242{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002243 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2244}
2245
Antoine Pitrou244651a2009-05-04 18:56:13 +00002246/* The decoder. The only state we preserve is our read position,
2247 * i.e. how many characters we have consumed. So if we end in the
2248 * middle of a shift sequence we have to back off the read position
2249 * and the output to the beginning of the sequence, otherwise we lose
2250 * all the shift state (seen bits, number of bits seen, high
2251 * surrogate). */
2252
Alexander Belopolsky40018472011-02-26 01:02:56 +00002253PyObject *
2254PyUnicode_DecodeUTF7Stateful(const char *s,
2255 Py_ssize_t size,
2256 const char *errors,
2257 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002258{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002260 Py_ssize_t startinpos;
2261 Py_ssize_t endinpos;
2262 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002263 const char *e;
2264 PyUnicodeObject *unicode;
2265 Py_UNICODE *p;
2266 const char *errmsg = "";
2267 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002268 Py_UNICODE *shiftOutStart;
2269 unsigned int base64bits = 0;
2270 unsigned long base64buffer = 0;
2271 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 PyObject *errorHandler = NULL;
2273 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002274
2275 unicode = _PyUnicode_New(size);
2276 if (!unicode)
2277 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002278 if (size == 0) {
2279 if (consumed)
2280 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002281 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002282 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002283
2284 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002285 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002286 e = s + size;
2287
2288 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002289 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002290 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002291 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002292
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293 if (inShift) { /* in a base-64 section */
2294 if (IS_BASE64(ch)) { /* consume a base-64 character */
2295 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2296 base64bits += 6;
2297 s++;
2298 if (base64bits >= 16) {
2299 /* we have enough bits for a UTF-16 value */
2300 Py_UNICODE outCh = (Py_UNICODE)
2301 (base64buffer >> (base64bits-16));
2302 base64bits -= 16;
2303 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2304 if (surrogate) {
2305 /* expecting a second surrogate */
2306 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2307#ifdef Py_UNICODE_WIDE
2308 *p++ = (((surrogate & 0x3FF)<<10)
2309 | (outCh & 0x3FF)) + 0x10000;
2310#else
2311 *p++ = surrogate;
2312 *p++ = outCh;
2313#endif
2314 surrogate = 0;
2315 }
2316 else {
2317 surrogate = 0;
2318 errmsg = "second surrogate missing";
2319 goto utf7Error;
2320 }
2321 }
2322 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2323 /* first surrogate */
2324 surrogate = outCh;
2325 }
2326 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2327 errmsg = "unexpected second surrogate";
2328 goto utf7Error;
2329 }
2330 else {
2331 *p++ = outCh;
2332 }
2333 }
2334 }
2335 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002336 inShift = 0;
2337 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002338 if (surrogate) {
2339 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002340 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002341 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002342 if (base64bits > 0) { /* left-over bits */
2343 if (base64bits >= 6) {
2344 /* We've seen at least one base-64 character */
2345 errmsg = "partial character in shift sequence";
2346 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002347 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 else {
2349 /* Some bits remain; they should be zero */
2350 if (base64buffer != 0) {
2351 errmsg = "non-zero padding bits in shift sequence";
2352 goto utf7Error;
2353 }
2354 }
2355 }
2356 if (ch != '-') {
2357 /* '-' is absorbed; other terminating
2358 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002359 *p++ = ch;
2360 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002361 }
2362 }
2363 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002364 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002365 s++; /* consume '+' */
2366 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002367 s++;
2368 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002369 }
2370 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002371 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002372 shiftOutStart = p;
2373 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002374 }
2375 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002376 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002377 *p++ = ch;
2378 s++;
2379 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002380 else {
2381 startinpos = s-starts;
2382 s++;
2383 errmsg = "unexpected special character";
2384 goto utf7Error;
2385 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002386 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002387utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002388 outpos = p-PyUnicode_AS_UNICODE(unicode);
2389 endinpos = s-starts;
2390 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002391 errors, &errorHandler,
2392 "utf7", errmsg,
2393 &starts, &e, &startinpos, &endinpos, &exc, &s,
2394 &unicode, &outpos, &p))
2395 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002396 }
2397
Antoine Pitrou244651a2009-05-04 18:56:13 +00002398 /* end of string */
2399
2400 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2401 /* if we're in an inconsistent state, that's an error */
2402 if (surrogate ||
2403 (base64bits >= 6) ||
2404 (base64bits > 0 && base64buffer != 0)) {
2405 outpos = p-PyUnicode_AS_UNICODE(unicode);
2406 endinpos = size;
2407 if (unicode_decode_call_errorhandler(
2408 errors, &errorHandler,
2409 "utf7", "unterminated shift sequence",
2410 &starts, &e, &startinpos, &endinpos, &exc, &s,
2411 &unicode, &outpos, &p))
2412 goto onError;
2413 if (s < e)
2414 goto restart;
2415 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002416 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002417
2418 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002419 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002420 if (inShift) {
2421 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002422 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002423 }
2424 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002425 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002426 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002427 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002428
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002429 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002430 goto onError;
2431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002432 Py_XDECREF(errorHandler);
2433 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002434 return (PyObject *)unicode;
2435
Benjamin Peterson29060642009-01-31 22:14:21 +00002436 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002437 Py_XDECREF(errorHandler);
2438 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002439 Py_DECREF(unicode);
2440 return NULL;
2441}
2442
2443
Alexander Belopolsky40018472011-02-26 01:02:56 +00002444PyObject *
2445PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2446 Py_ssize_t size,
2447 int base64SetO,
2448 int base64WhiteSpace,
2449 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002450{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002451 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002452 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002453 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002454 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002455 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002456 unsigned int base64bits = 0;
2457 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002458 char * out;
2459 char * start;
2460
2461 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002462 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002463
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002464 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002465 return PyErr_NoMemory();
2466
Antoine Pitrou244651a2009-05-04 18:56:13 +00002467 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002468 if (v == NULL)
2469 return NULL;
2470
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002471 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002472 for (;i < size; ++i) {
2473 Py_UNICODE ch = s[i];
2474
Antoine Pitrou244651a2009-05-04 18:56:13 +00002475 if (inShift) {
2476 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2477 /* shifting out */
2478 if (base64bits) { /* output remaining bits */
2479 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2480 base64buffer = 0;
2481 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002482 }
2483 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002484 /* Characters not in the BASE64 set implicitly unshift the sequence
2485 so no '-' is required, except if the character is itself a '-' */
2486 if (IS_BASE64(ch) || ch == '-') {
2487 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002488 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002489 *out++ = (char) ch;
2490 }
2491 else {
2492 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002493 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002494 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002495 else { /* not in a shift sequence */
2496 if (ch == '+') {
2497 *out++ = '+';
2498 *out++ = '-';
2499 }
2500 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2501 *out++ = (char) ch;
2502 }
2503 else {
2504 *out++ = '+';
2505 inShift = 1;
2506 goto encode_char;
2507 }
2508 }
2509 continue;
2510encode_char:
2511#ifdef Py_UNICODE_WIDE
2512 if (ch >= 0x10000) {
2513 /* code first surrogate */
2514 base64bits += 16;
2515 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2516 while (base64bits >= 6) {
2517 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2518 base64bits -= 6;
2519 }
2520 /* prepare second surrogate */
2521 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2522 }
2523#endif
2524 base64bits += 16;
2525 base64buffer = (base64buffer << 16) | ch;
2526 while (base64bits >= 6) {
2527 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2528 base64bits -= 6;
2529 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002530 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002531 if (base64bits)
2532 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2533 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002534 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002535 if (_PyBytes_Resize(&v, out - start) < 0)
2536 return NULL;
2537 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002538}
2539
Antoine Pitrou244651a2009-05-04 18:56:13 +00002540#undef IS_BASE64
2541#undef FROM_BASE64
2542#undef TO_BASE64
2543#undef DECODE_DIRECT
2544#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002545
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546/* --- UTF-8 Codec -------------------------------------------------------- */
2547
/* Read-only lookup table, hence `const`. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2569
Alexander Belopolsky40018472011-02-26 01:02:56 +00002570PyObject *
2571PyUnicode_DecodeUTF8(const char *s,
2572 Py_ssize_t size,
2573 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574{
Walter Dörwald69652032004-09-07 20:24:22 +00002575 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2576}
2577
Antoine Pitrouab868312009-01-10 15:40:25 +00002578/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2579#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2580
2581/* Mask to quickly check whether a C 'long' contains a
2582 non-ASCII, UTF8-encoded char. */
2583#if (SIZEOF_LONG == 8)
2584# define ASCII_CHAR_MASK 0x8080808080808080L
2585#elif (SIZEOF_LONG == 4)
2586# define ASCII_CHAR_MASK 0x80808080L
2587#else
2588# error C 'long' size should be either 4 or 8!
2589#endif
2590
Alexander Belopolsky40018472011-02-26 01:02:56 +00002591PyObject *
2592PyUnicode_DecodeUTF8Stateful(const char *s,
2593 Py_ssize_t size,
2594 const char *errors,
2595 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002596{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002597 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002599 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002600 Py_ssize_t startinpos;
2601 Py_ssize_t endinpos;
2602 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002603 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 PyUnicodeObject *unicode;
2605 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002606 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002607 PyObject *errorHandler = NULL;
2608 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609
2610 /* Note: size will always be longer than the resulting Unicode
2611 character count */
2612 unicode = _PyUnicode_New(size);
2613 if (!unicode)
2614 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002615 if (size == 0) {
2616 if (consumed)
2617 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620
2621 /* Unpack UTF-8 encoded data */
2622 p = unicode->str;
2623 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002624 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625
2626 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002627 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628
2629 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002630 /* Fast path for runs of ASCII characters. Given that common UTF-8
2631 input will consist of an overwhelming majority of ASCII
2632 characters, we try to optimize for this case by checking
2633 as many characters as a C 'long' can contain.
2634 First, check if we can do an aligned read, as most CPUs have
2635 a penalty for unaligned reads.
2636 */
2637 if (!((size_t) s & LONG_PTR_MASK)) {
2638 /* Help register allocation */
2639 register const char *_s = s;
2640 register Py_UNICODE *_p = p;
2641 while (_s < aligned_end) {
2642 /* Read a whole long at a time (either 4 or 8 bytes),
2643 and do a fast unrolled copy if it only contains ASCII
2644 characters. */
2645 unsigned long data = *(unsigned long *) _s;
2646 if (data & ASCII_CHAR_MASK)
2647 break;
2648 _p[0] = (unsigned char) _s[0];
2649 _p[1] = (unsigned char) _s[1];
2650 _p[2] = (unsigned char) _s[2];
2651 _p[3] = (unsigned char) _s[3];
2652#if (SIZEOF_LONG == 8)
2653 _p[4] = (unsigned char) _s[4];
2654 _p[5] = (unsigned char) _s[5];
2655 _p[6] = (unsigned char) _s[6];
2656 _p[7] = (unsigned char) _s[7];
2657#endif
2658 _s += SIZEOF_LONG;
2659 _p += SIZEOF_LONG;
2660 }
2661 s = _s;
2662 p = _p;
2663 if (s == e)
2664 break;
2665 ch = (unsigned char)*s;
2666 }
2667 }
2668
2669 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002670 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 s++;
2672 continue;
2673 }
2674
2675 n = utf8_code_length[ch];
2676
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002677 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002678 if (consumed)
2679 break;
2680 else {
2681 errmsg = "unexpected end of data";
2682 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002683 endinpos = startinpos+1;
2684 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2685 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 goto utf8Error;
2687 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689
2690 switch (n) {
2691
2692 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002693 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002694 startinpos = s-starts;
2695 endinpos = startinpos+1;
2696 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697
2698 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002699 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 startinpos = s-starts;
2701 endinpos = startinpos+1;
2702 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703
2704 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002705 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002706 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002707 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002708 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002709 goto utf8Error;
2710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002712 assert ((ch > 0x007F) && (ch <= 0x07FF));
2713 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 break;
2715
2716 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002717 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2718 will result in surrogates in range d800-dfff. Surrogates are
2719 not valid UTF-8 so they are rejected.
2720 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2721 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002722 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002723 (s[2] & 0xc0) != 0x80 ||
2724 ((unsigned char)s[0] == 0xE0 &&
2725 (unsigned char)s[1] < 0xA0) ||
2726 ((unsigned char)s[0] == 0xED &&
2727 (unsigned char)s[1] > 0x9F)) {
2728 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002729 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002730 endinpos = startinpos + 1;
2731
2732 /* if s[1] first two bits are 1 and 0, then the invalid
2733 continuation byte is s[2], so increment endinpos by 1,
2734 if not, s[1] is invalid and endinpos doesn't need to
2735 be incremented. */
2736 if ((s[1] & 0xC0) == 0x80)
2737 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 goto utf8Error;
2739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002741 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2742 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002743 break;
2744
2745 case 4:
2746 if ((s[1] & 0xc0) != 0x80 ||
2747 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002748 (s[3] & 0xc0) != 0x80 ||
2749 ((unsigned char)s[0] == 0xF0 &&
2750 (unsigned char)s[1] < 0x90) ||
2751 ((unsigned char)s[0] == 0xF4 &&
2752 (unsigned char)s[1] > 0x8F)) {
2753 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002754 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002755 endinpos = startinpos + 1;
2756 if ((s[1] & 0xC0) == 0x80) {
2757 endinpos++;
2758 if ((s[2] & 0xC0) == 0x80)
2759 endinpos++;
2760 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002761 goto utf8Error;
2762 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002763 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002764 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2765 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2766
Fredrik Lundh8f455852001-06-27 18:59:43 +00002767#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002768 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002769#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002770 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002771
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002772 /* translate from 10000..10FFFF to 0..FFFF */
2773 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002774
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002775 /* high surrogate = top 10 bits added to D800 */
2776 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002777
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002778 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002779 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002780#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 }
2783 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002784 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002785
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 utf8Error:
2787 outpos = p-PyUnicode_AS_UNICODE(unicode);
2788 if (unicode_decode_call_errorhandler(
2789 errors, &errorHandler,
2790 "utf8", errmsg,
2791 &starts, &e, &startinpos, &endinpos, &exc, &s,
2792 &unicode, &outpos, &p))
2793 goto onError;
2794 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 }
Walter Dörwald69652032004-09-07 20:24:22 +00002796 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798
2799 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002800 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 goto onError;
2802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 Py_XDECREF(errorHandler);
2804 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 return (PyObject *)unicode;
2806
Benjamin Peterson29060642009-01-31 22:14:21 +00002807 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002808 Py_XDECREF(errorHandler);
2809 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 Py_DECREF(unicode);
2811 return NULL;
2812}
2813
Antoine Pitrouab868312009-01-10 15:40:25 +00002814#undef ASCII_CHAR_MASK
2815
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002816#ifdef __APPLE__
2817
2818/* Simplified UTF-8 decoder using surrogateescape error handler,
2819 used to decode the command line arguments on Mac OS X. */
2820
2821wchar_t*
2822_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2823{
2824 int n;
2825 const char *e;
2826 wchar_t *unicode, *p;
2827
2828 /* Note: size will always be longer than the resulting Unicode
2829 character count */
2830 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2831 PyErr_NoMemory();
2832 return NULL;
2833 }
2834 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2835 if (!unicode)
2836 return NULL;
2837
2838 /* Unpack UTF-8 encoded data */
2839 p = unicode;
2840 e = s + size;
2841 while (s < e) {
2842 Py_UCS4 ch = (unsigned char)*s;
2843
2844 if (ch < 0x80) {
2845 *p++ = (wchar_t)ch;
2846 s++;
2847 continue;
2848 }
2849
2850 n = utf8_code_length[ch];
2851 if (s + n > e) {
2852 goto surrogateescape;
2853 }
2854
2855 switch (n) {
2856 case 0:
2857 case 1:
2858 goto surrogateescape;
2859
2860 case 2:
2861 if ((s[1] & 0xc0) != 0x80)
2862 goto surrogateescape;
2863 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2864 assert ((ch > 0x007F) && (ch <= 0x07FF));
2865 *p++ = (wchar_t)ch;
2866 break;
2867
2868 case 3:
2869 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2870 will result in surrogates in range d800-dfff. Surrogates are
2871 not valid UTF-8 so they are rejected.
2872 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2873 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2874 if ((s[1] & 0xc0) != 0x80 ||
2875 (s[2] & 0xc0) != 0x80 ||
2876 ((unsigned char)s[0] == 0xE0 &&
2877 (unsigned char)s[1] < 0xA0) ||
2878 ((unsigned char)s[0] == 0xED &&
2879 (unsigned char)s[1] > 0x9F)) {
2880
2881 goto surrogateescape;
2882 }
2883 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2884 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2885 *p++ = (Py_UNICODE)ch;
2886 break;
2887
2888 case 4:
2889 if ((s[1] & 0xc0) != 0x80 ||
2890 (s[2] & 0xc0) != 0x80 ||
2891 (s[3] & 0xc0) != 0x80 ||
2892 ((unsigned char)s[0] == 0xF0 &&
2893 (unsigned char)s[1] < 0x90) ||
2894 ((unsigned char)s[0] == 0xF4 &&
2895 (unsigned char)s[1] > 0x8F)) {
2896 goto surrogateescape;
2897 }
2898 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2899 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2900 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2901
2902#if SIZEOF_WCHAR_T == 4
2903 *p++ = (wchar_t)ch;
2904#else
2905 /* compute and append the two surrogates: */
2906
2907 /* translate from 10000..10FFFF to 0..FFFF */
2908 ch -= 0x10000;
2909
2910 /* high surrogate = top 10 bits added to D800 */
2911 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2912
2913 /* low surrogate = bottom 10 bits added to DC00 */
2914 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2915#endif
2916 break;
2917 }
2918 s += n;
2919 continue;
2920
2921 surrogateescape:
2922 *p++ = 0xDC00 + ch;
2923 s++;
2924 }
2925 *p = L'\0';
2926 return unicode;
2927}
2928
2929#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002930
Tim Peters602f7402002-04-27 18:03:26 +00002931/* Allocation strategy: if the string is short, convert into a stack buffer
2932 and allocate exactly as much space needed at the end. Else allocate the
2933 maximum possible needed (4 result bytes per Unicode character), and return
2934 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002935*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002936PyObject *
2937PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002938 Py_ssize_t size,
2939 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940{
Tim Peters602f7402002-04-27 18:03:26 +00002941#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002942
Guido van Rossum98297ee2007-11-06 21:34:58 +00002943 Py_ssize_t i; /* index into s of next input byte */
2944 PyObject *result; /* result string object */
2945 char *p; /* next free byte in output buffer */
2946 Py_ssize_t nallocated; /* number of result bytes allocated */
2947 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002948 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002949 PyObject *errorHandler = NULL;
2950 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002951
Tim Peters602f7402002-04-27 18:03:26 +00002952 assert(s != NULL);
2953 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954
Tim Peters602f7402002-04-27 18:03:26 +00002955 if (size <= MAX_SHORT_UNICHARS) {
2956 /* Write into the stack buffer; nallocated can't overflow.
2957 * At the end, we'll allocate exactly as much heap space as it
2958 * turns out we need.
2959 */
2960 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002961 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002962 p = stackbuf;
2963 }
2964 else {
2965 /* Overallocate on the heap, and give the excess back at the end. */
2966 nallocated = size * 4;
2967 if (nallocated / 4 != size) /* overflow! */
2968 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002969 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002970 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002971 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002972 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002973 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002974
Tim Peters602f7402002-04-27 18:03:26 +00002975 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002976 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002977
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002978 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002979 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002981
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002983 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002984 *p++ = (char)(0xc0 | (ch >> 6));
2985 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002986 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002987#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002988 /* Special case: check for high and low surrogate */
2989 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2990 Py_UCS4 ch2 = s[i];
2991 /* Combine the two surrogates to form a UCS4 value */
2992 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2993 i++;
2994
2995 /* Encode UCS4 Unicode ordinals */
2996 *p++ = (char)(0xf0 | (ch >> 18));
2997 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002998 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2999 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00003000 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00003001#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003002 Py_ssize_t newpos;
3003 PyObject *rep;
3004 Py_ssize_t repsize, k;
3005 rep = unicode_encode_call_errorhandler
3006 (errors, &errorHandler, "utf-8", "surrogates not allowed",
3007 s, size, &exc, i-1, i, &newpos);
3008 if (!rep)
3009 goto error;
3010
3011 if (PyBytes_Check(rep))
3012 repsize = PyBytes_GET_SIZE(rep);
3013 else
3014 repsize = PyUnicode_GET_SIZE(rep);
3015
3016 if (repsize > 4) {
3017 Py_ssize_t offset;
3018
3019 if (result == NULL)
3020 offset = p - stackbuf;
3021 else
3022 offset = p - PyBytes_AS_STRING(result);
3023
3024 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3025 /* integer overflow */
3026 PyErr_NoMemory();
3027 goto error;
3028 }
3029 nallocated += repsize - 4;
3030 if (result != NULL) {
3031 if (_PyBytes_Resize(&result, nallocated) < 0)
3032 goto error;
3033 } else {
3034 result = PyBytes_FromStringAndSize(NULL, nallocated);
3035 if (result == NULL)
3036 goto error;
3037 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3038 }
3039 p = PyBytes_AS_STRING(result) + offset;
3040 }
3041
3042 if (PyBytes_Check(rep)) {
3043 char *prep = PyBytes_AS_STRING(rep);
3044 for(k = repsize; k > 0; k--)
3045 *p++ = *prep++;
3046 } else /* rep is unicode */ {
3047 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3048 Py_UNICODE c;
3049
3050 for(k=0; k<repsize; k++) {
3051 c = prep[k];
3052 if (0x80 <= c) {
3053 raise_encode_exception(&exc, "utf-8", s, size,
3054 i-1, i, "surrogates not allowed");
3055 goto error;
3056 }
3057 *p++ = (char)prep[k];
3058 }
3059 }
3060 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003061#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003062 }
Victor Stinner445a6232010-04-22 20:01:57 +00003063#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003064 } else if (ch < 0x10000) {
3065 *p++ = (char)(0xe0 | (ch >> 12));
3066 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3067 *p++ = (char)(0x80 | (ch & 0x3f));
3068 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003069 /* Encode UCS4 Unicode ordinals */
3070 *p++ = (char)(0xf0 | (ch >> 18));
3071 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3072 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3073 *p++ = (char)(0x80 | (ch & 0x3f));
3074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003076
Guido van Rossum98297ee2007-11-06 21:34:58 +00003077 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003078 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003079 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003080 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003081 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003082 }
3083 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003084 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003085 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003086 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003087 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003088 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003089 Py_XDECREF(errorHandler);
3090 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003091 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003092 error:
3093 Py_XDECREF(errorHandler);
3094 Py_XDECREF(exc);
3095 Py_XDECREF(result);
3096 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003097
Tim Peters602f7402002-04-27 18:03:26 +00003098#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099}
3100
Alexander Belopolsky40018472011-02-26 01:02:56 +00003101PyObject *
3102PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103{
Victor Stinnera5c68c32011-03-02 01:03:14 +00003104 PyObject *utf8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 if (!PyUnicode_Check(unicode)) {
3106 PyErr_BadArgument();
3107 return NULL;
3108 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003109 utf8 = _PyUnicode_AsDefaultEncodedString(unicode);
3110 if (utf8 == NULL)
3111 return NULL;
3112 Py_INCREF(utf8);
3113 return utf8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114}
3115
Walter Dörwald41980ca2007-08-16 21:55:45 +00003116/* --- UTF-32 Codec ------------------------------------------------------- */
3117
3118PyObject *
3119PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003120 Py_ssize_t size,
3121 const char *errors,
3122 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003123{
3124 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3125}
3126
3127PyObject *
3128PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003129 Py_ssize_t size,
3130 const char *errors,
3131 int *byteorder,
3132 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003133{
3134 const char *starts = s;
3135 Py_ssize_t startinpos;
3136 Py_ssize_t endinpos;
3137 Py_ssize_t outpos;
3138 PyUnicodeObject *unicode;
3139 Py_UNICODE *p;
3140#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003141 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003142 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003143#else
3144 const int pairs = 0;
3145#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003146 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003147 int bo = 0; /* assume native ordering by default */
3148 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003149 /* Offsets from q for retrieving bytes in the right order. */
3150#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3151 int iorder[] = {0, 1, 2, 3};
3152#else
3153 int iorder[] = {3, 2, 1, 0};
3154#endif
3155 PyObject *errorHandler = NULL;
3156 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003157
Walter Dörwald41980ca2007-08-16 21:55:45 +00003158 q = (unsigned char *)s;
3159 e = q + size;
3160
3161 if (byteorder)
3162 bo = *byteorder;
3163
3164 /* Check for BOM marks (U+FEFF) in the input and adjust current
3165 byte order setting accordingly. In native mode, the leading BOM
3166 mark is skipped, in all other modes, it is copied to the output
3167 stream as-is (giving a ZWNBSP character). */
3168 if (bo == 0) {
3169 if (size >= 4) {
3170 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003172#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003173 if (bom == 0x0000FEFF) {
3174 q += 4;
3175 bo = -1;
3176 }
3177 else if (bom == 0xFFFE0000) {
3178 q += 4;
3179 bo = 1;
3180 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003181#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003182 if (bom == 0x0000FEFF) {
3183 q += 4;
3184 bo = 1;
3185 }
3186 else if (bom == 0xFFFE0000) {
3187 q += 4;
3188 bo = -1;
3189 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003190#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003192 }
3193
3194 if (bo == -1) {
3195 /* force LE */
3196 iorder[0] = 0;
3197 iorder[1] = 1;
3198 iorder[2] = 2;
3199 iorder[3] = 3;
3200 }
3201 else if (bo == 1) {
3202 /* force BE */
3203 iorder[0] = 3;
3204 iorder[1] = 2;
3205 iorder[2] = 1;
3206 iorder[3] = 0;
3207 }
3208
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003209 /* On narrow builds we split characters outside the BMP into two
3210 codepoints => count how much extra space we need. */
3211#ifndef Py_UNICODE_WIDE
3212 for (qq = q; qq < e; qq += 4)
3213 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3214 pairs++;
3215#endif
3216
3217 /* This might be one to much, because of a BOM */
3218 unicode = _PyUnicode_New((size+3)/4+pairs);
3219 if (!unicode)
3220 return NULL;
3221 if (size == 0)
3222 return (PyObject *)unicode;
3223
3224 /* Unpack UTF-32 encoded data */
3225 p = unicode->str;
3226
Walter Dörwald41980ca2007-08-16 21:55:45 +00003227 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 Py_UCS4 ch;
3229 /* remaining bytes at the end? (size should be divisible by 4) */
3230 if (e-q<4) {
3231 if (consumed)
3232 break;
3233 errmsg = "truncated data";
3234 startinpos = ((const char *)q)-starts;
3235 endinpos = ((const char *)e)-starts;
3236 goto utf32Error;
3237 /* The remaining input chars are ignored if the callback
3238 chooses to skip the input */
3239 }
3240 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3241 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003242
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 if (ch >= 0x110000)
3244 {
3245 errmsg = "codepoint not in range(0x110000)";
3246 startinpos = ((const char *)q)-starts;
3247 endinpos = startinpos+4;
3248 goto utf32Error;
3249 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003250#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003251 if (ch >= 0x10000)
3252 {
3253 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3254 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3255 }
3256 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003257#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003258 *p++ = ch;
3259 q += 4;
3260 continue;
3261 utf32Error:
3262 outpos = p-PyUnicode_AS_UNICODE(unicode);
3263 if (unicode_decode_call_errorhandler(
3264 errors, &errorHandler,
3265 "utf32", errmsg,
3266 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3267 &unicode, &outpos, &p))
3268 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003269 }
3270
3271 if (byteorder)
3272 *byteorder = bo;
3273
3274 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003276
3277 /* Adjust length */
3278 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3279 goto onError;
3280
3281 Py_XDECREF(errorHandler);
3282 Py_XDECREF(exc);
3283 return (PyObject *)unicode;
3284
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003286 Py_DECREF(unicode);
3287 Py_XDECREF(errorHandler);
3288 Py_XDECREF(exc);
3289 return NULL;
3290}
3291
3292PyObject *
3293PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 Py_ssize_t size,
3295 const char *errors,
3296 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003297{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003298 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003299 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003300 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003301#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003302 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003303#else
3304 const int pairs = 0;
3305#endif
3306 /* Offsets from p for storing byte pairs in the right order. */
3307#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3308 int iorder[] = {0, 1, 2, 3};
3309#else
3310 int iorder[] = {3, 2, 1, 0};
3311#endif
3312
Benjamin Peterson29060642009-01-31 22:14:21 +00003313#define STORECHAR(CH) \
3314 do { \
3315 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3316 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3317 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3318 p[iorder[0]] = (CH) & 0xff; \
3319 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320 } while(0)
3321
3322 /* In narrow builds we can output surrogate pairs as one codepoint,
3323 so we need less space. */
3324#ifndef Py_UNICODE_WIDE
3325 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3327 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3328 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003329#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003330 nsize = (size - pairs + (byteorder == 0));
3331 bytesize = nsize * 4;
3332 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003334 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003335 if (v == NULL)
3336 return NULL;
3337
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003338 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003339 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003340 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003341 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003342 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003343
3344 if (byteorder == -1) {
3345 /* force LE */
3346 iorder[0] = 0;
3347 iorder[1] = 1;
3348 iorder[2] = 2;
3349 iorder[3] = 3;
3350 }
3351 else if (byteorder == 1) {
3352 /* force BE */
3353 iorder[0] = 3;
3354 iorder[1] = 2;
3355 iorder[2] = 1;
3356 iorder[3] = 0;
3357 }
3358
3359 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003360 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003361#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003362 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3363 Py_UCS4 ch2 = *s;
3364 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3365 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3366 s++;
3367 size--;
3368 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003369 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003370#endif
3371 STORECHAR(ch);
3372 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003373
3374 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003375 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003376#undef STORECHAR
3377}
3378
Alexander Belopolsky40018472011-02-26 01:02:56 +00003379PyObject *
3380PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003381{
3382 if (!PyUnicode_Check(unicode)) {
3383 PyErr_BadArgument();
3384 return NULL;
3385 }
3386 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003387 PyUnicode_GET_SIZE(unicode),
3388 NULL,
3389 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003390}
3391
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392/* --- UTF-16 Codec ------------------------------------------------------- */
3393
Tim Peters772747b2001-08-09 22:21:55 +00003394PyObject *
3395PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 Py_ssize_t size,
3397 const char *errors,
3398 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399{
Walter Dörwald69652032004-09-07 20:24:22 +00003400 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3401}
3402
Antoine Pitrouab868312009-01-10 15:40:25 +00003403/* Two masks for fast checking of whether a C 'long' may contain
3404 UTF16-encoded surrogate characters. This is an efficient heuristic,
3405 assuming that non-surrogate characters with a code point >= 0x8000 are
3406 rare in most input.
3407 FAST_CHAR_MASK is used when the input is in native byte ordering,
3408 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003409*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003410#if (SIZEOF_LONG == 8)
3411# define FAST_CHAR_MASK 0x8000800080008000L
3412# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3413#elif (SIZEOF_LONG == 4)
3414# define FAST_CHAR_MASK 0x80008000L
3415# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3416#else
3417# error C 'long' size should be either 4 or 8!
3418#endif
3419
Walter Dörwald69652032004-09-07 20:24:22 +00003420PyObject *
3421PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003422 Py_ssize_t size,
3423 const char *errors,
3424 int *byteorder,
3425 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003426{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003428 Py_ssize_t startinpos;
3429 Py_ssize_t endinpos;
3430 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431 PyUnicodeObject *unicode;
3432 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003433 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003434 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003435 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003436 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003437 /* Offsets from q for retrieving byte pairs in the right order. */
3438#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3439 int ihi = 1, ilo = 0;
3440#else
3441 int ihi = 0, ilo = 1;
3442#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003443 PyObject *errorHandler = NULL;
3444 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445
3446 /* Note: size will always be longer than the resulting Unicode
3447 character count */
3448 unicode = _PyUnicode_New(size);
3449 if (!unicode)
3450 return NULL;
3451 if (size == 0)
3452 return (PyObject *)unicode;
3453
3454 /* Unpack UTF-16 encoded data */
3455 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003456 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003457 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458
3459 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003460 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003462 /* Check for BOM marks (U+FEFF) in the input and adjust current
3463 byte order setting accordingly. In native mode, the leading BOM
3464 mark is skipped, in all other modes, it is copied to the output
3465 stream as-is (giving a ZWNBSP character). */
3466 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003467 if (size >= 2) {
3468 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003469#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 if (bom == 0xFEFF) {
3471 q += 2;
3472 bo = -1;
3473 }
3474 else if (bom == 0xFFFE) {
3475 q += 2;
3476 bo = 1;
3477 }
Tim Petersced69f82003-09-16 20:30:58 +00003478#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 if (bom == 0xFEFF) {
3480 q += 2;
3481 bo = 1;
3482 }
3483 else if (bom == 0xFFFE) {
3484 q += 2;
3485 bo = -1;
3486 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003487#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003488 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490
Tim Peters772747b2001-08-09 22:21:55 +00003491 if (bo == -1) {
3492 /* force LE */
3493 ihi = 1;
3494 ilo = 0;
3495 }
3496 else if (bo == 1) {
3497 /* force BE */
3498 ihi = 0;
3499 ilo = 1;
3500 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003501#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3502 native_ordering = ilo < ihi;
3503#else
3504 native_ordering = ilo > ihi;
3505#endif
Tim Peters772747b2001-08-09 22:21:55 +00003506
Antoine Pitrouab868312009-01-10 15:40:25 +00003507 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003508 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003509 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003510 /* First check for possible aligned read of a C 'long'. Unaligned
3511 reads are more expensive, better to defer to another iteration. */
3512 if (!((size_t) q & LONG_PTR_MASK)) {
3513 /* Fast path for runs of non-surrogate chars. */
3514 register const unsigned char *_q = q;
3515 Py_UNICODE *_p = p;
3516 if (native_ordering) {
3517 /* Native ordering is simple: as long as the input cannot
3518 possibly contain a surrogate char, do an unrolled copy
3519 of several 16-bit code points to the target object.
3520 The non-surrogate check is done on several input bytes
3521 at a time (as many as a C 'long' can contain). */
3522 while (_q < aligned_end) {
3523 unsigned long data = * (unsigned long *) _q;
3524 if (data & FAST_CHAR_MASK)
3525 break;
3526 _p[0] = ((unsigned short *) _q)[0];
3527 _p[1] = ((unsigned short *) _q)[1];
3528#if (SIZEOF_LONG == 8)
3529 _p[2] = ((unsigned short *) _q)[2];
3530 _p[3] = ((unsigned short *) _q)[3];
3531#endif
3532 _q += SIZEOF_LONG;
3533 _p += SIZEOF_LONG / 2;
3534 }
3535 }
3536 else {
3537 /* Byteswapped ordering is similar, but we must decompose
3538 the copy bytewise, and take care of zero'ing out the
3539 upper bytes if the target object is in 32-bit units
3540 (that is, in UCS-4 builds). */
3541 while (_q < aligned_end) {
3542 unsigned long data = * (unsigned long *) _q;
3543 if (data & SWAPPED_FAST_CHAR_MASK)
3544 break;
3545 /* Zero upper bytes in UCS-4 builds */
3546#if (Py_UNICODE_SIZE > 2)
3547 _p[0] = 0;
3548 _p[1] = 0;
3549#if (SIZEOF_LONG == 8)
3550 _p[2] = 0;
3551 _p[3] = 0;
3552#endif
3553#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003554 /* Issue #4916; UCS-4 builds on big endian machines must
3555 fill the two last bytes of each 4-byte unit. */
3556#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3557# define OFF 2
3558#else
3559# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003560#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003561 ((unsigned char *) _p)[OFF + 1] = _q[0];
3562 ((unsigned char *) _p)[OFF + 0] = _q[1];
3563 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3564 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3565#if (SIZEOF_LONG == 8)
3566 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3567 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3568 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3569 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3570#endif
3571#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003572 _q += SIZEOF_LONG;
3573 _p += SIZEOF_LONG / 2;
3574 }
3575 }
3576 p = _p;
3577 q = _q;
3578 if (q >= e)
3579 break;
3580 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003581 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582
Benjamin Peterson14339b62009-01-31 16:36:08 +00003583 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003584
3585 if (ch < 0xD800 || ch > 0xDFFF) {
3586 *p++ = ch;
3587 continue;
3588 }
3589
3590 /* UTF-16 code pair: */
3591 if (q > e) {
3592 errmsg = "unexpected end of data";
3593 startinpos = (((const char *)q) - 2) - starts;
3594 endinpos = ((const char *)e) + 1 - starts;
3595 goto utf16Error;
3596 }
3597 if (0xD800 <= ch && ch <= 0xDBFF) {
3598 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3599 q += 2;
3600 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003601#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 *p++ = ch;
3603 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003604#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003605 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003606#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003607 continue;
3608 }
3609 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003610 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003611 startinpos = (((const char *)q)-4)-starts;
3612 endinpos = startinpos+2;
3613 goto utf16Error;
3614 }
3615
Benjamin Peterson14339b62009-01-31 16:36:08 +00003616 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 errmsg = "illegal encoding";
3618 startinpos = (((const char *)q)-2)-starts;
3619 endinpos = startinpos+2;
3620 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003621
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 utf16Error:
3623 outpos = p - PyUnicode_AS_UNICODE(unicode);
3624 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003625 errors,
3626 &errorHandler,
3627 "utf16", errmsg,
3628 &starts,
3629 (const char **)&e,
3630 &startinpos,
3631 &endinpos,
3632 &exc,
3633 (const char **)&q,
3634 &unicode,
3635 &outpos,
3636 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003639 /* remaining byte at the end? (size should be even) */
3640 if (e == q) {
3641 if (!consumed) {
3642 errmsg = "truncated data";
3643 startinpos = ((const char *)q) - starts;
3644 endinpos = ((const char *)e) + 1 - starts;
3645 outpos = p - PyUnicode_AS_UNICODE(unicode);
3646 if (unicode_decode_call_errorhandler(
3647 errors,
3648 &errorHandler,
3649 "utf16", errmsg,
3650 &starts,
3651 (const char **)&e,
3652 &startinpos,
3653 &endinpos,
3654 &exc,
3655 (const char **)&q,
3656 &unicode,
3657 &outpos,
3658 &p))
3659 goto onError;
3660 /* The remaining input chars are ignored if the callback
3661 chooses to skip the input */
3662 }
3663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664
3665 if (byteorder)
3666 *byteorder = bo;
3667
Walter Dörwald69652032004-09-07 20:24:22 +00003668 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003670
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003672 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 goto onError;
3674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 Py_XDECREF(errorHandler);
3676 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 return (PyObject *)unicode;
3678
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 Py_XDECREF(errorHandler);
3682 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 return NULL;
3684}
3685
Antoine Pitrouab868312009-01-10 15:40:25 +00003686#undef FAST_CHAR_MASK
3687#undef SWAPPED_FAST_CHAR_MASK
3688
Tim Peters772747b2001-08-09 22:21:55 +00003689PyObject *
3690PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 Py_ssize_t size,
3692 const char *errors,
3693 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003695 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003696 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003697 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003698#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003699 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003700#else
3701 const int pairs = 0;
3702#endif
Tim Peters772747b2001-08-09 22:21:55 +00003703 /* Offsets from p for storing byte pairs in the right order. */
3704#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3705 int ihi = 1, ilo = 0;
3706#else
3707 int ihi = 0, ilo = 1;
3708#endif
3709
Benjamin Peterson29060642009-01-31 22:14:21 +00003710#define STORECHAR(CH) \
3711 do { \
3712 p[ihi] = ((CH) >> 8) & 0xff; \
3713 p[ilo] = (CH) & 0xff; \
3714 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003715 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003717#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003718 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003719 if (s[i] >= 0x10000)
3720 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003721#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003722 /* 2 * (size + pairs + (byteorder == 0)) */
3723 if (size > PY_SSIZE_T_MAX ||
3724 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003725 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003726 nsize = size + pairs + (byteorder == 0);
3727 bytesize = nsize * 2;
3728 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003730 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 if (v == NULL)
3732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003734 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003736 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003737 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003738 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003739
3740 if (byteorder == -1) {
3741 /* force LE */
3742 ihi = 1;
3743 ilo = 0;
3744 }
3745 else if (byteorder == 1) {
3746 /* force BE */
3747 ihi = 0;
3748 ilo = 1;
3749 }
3750
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003751 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003752 Py_UNICODE ch = *s++;
3753 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003754#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003755 if (ch >= 0x10000) {
3756 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3757 ch = 0xD800 | ((ch-0x10000) >> 10);
3758 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003759#endif
Tim Peters772747b2001-08-09 22:21:55 +00003760 STORECHAR(ch);
3761 if (ch2)
3762 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003763 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003764
3765 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003766 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003767#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768}
3769
Alexander Belopolsky40018472011-02-26 01:02:56 +00003770PyObject *
3771PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772{
3773 if (!PyUnicode_Check(unicode)) {
3774 PyErr_BadArgument();
3775 return NULL;
3776 }
3777 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003778 PyUnicode_GET_SIZE(unicode),
3779 NULL,
3780 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781}
3782
3783/* --- Unicode Escape Codec ----------------------------------------------- */
3784
Fredrik Lundh06d12682001-01-24 07:59:11 +00003785static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003786
Alexander Belopolsky40018472011-02-26 01:02:56 +00003787PyObject *
3788PyUnicode_DecodeUnicodeEscape(const char *s,
3789 Py_ssize_t size,
3790 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003793 Py_ssize_t startinpos;
3794 Py_ssize_t endinpos;
3795 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003800 char* message;
3801 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003802 PyObject *errorHandler = NULL;
3803 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003804
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 /* Escaped strings will always be longer than the resulting
3806 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807 length after conversion to the true value.
3808 (but if the error callback returns a long replacement string
3809 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 v = _PyUnicode_New(size);
3811 if (v == NULL)
3812 goto onError;
3813 if (size == 0)
3814 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003815
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003818
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 while (s < end) {
3820 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003821 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823
3824 /* Non-escape characters are interpreted as Unicode ordinals */
3825 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003826 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 continue;
3828 }
3829
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003830 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 /* \ - Escapes */
3832 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003833 c = *s++;
3834 if (s > end)
3835 c = '\0'; /* Invalid after \ */
3836 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837
Benjamin Peterson29060642009-01-31 22:14:21 +00003838 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 case '\n': break;
3840 case '\\': *p++ = '\\'; break;
3841 case '\'': *p++ = '\''; break;
3842 case '\"': *p++ = '\"'; break;
3843 case 'b': *p++ = '\b'; break;
3844 case 'f': *p++ = '\014'; break; /* FF */
3845 case 't': *p++ = '\t'; break;
3846 case 'n': *p++ = '\n'; break;
3847 case 'r': *p++ = '\r'; break;
3848 case 'v': *p++ = '\013'; break; /* VT */
3849 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3850
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 case '0': case '1': case '2': case '3':
3853 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003854 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003855 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003856 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003857 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003858 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003860 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 break;
3862
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 /* hex escapes */
3864 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003866 digits = 2;
3867 message = "truncated \\xXX escape";
3868 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869
Benjamin Peterson29060642009-01-31 22:14:21 +00003870 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003872 digits = 4;
3873 message = "truncated \\uXXXX escape";
3874 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875
Benjamin Peterson29060642009-01-31 22:14:21 +00003876 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003877 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003878 digits = 8;
3879 message = "truncated \\UXXXXXXXX escape";
3880 hexescape:
3881 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882 outpos = p-PyUnicode_AS_UNICODE(v);
3883 if (s+digits>end) {
3884 endinpos = size;
3885 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 errors, &errorHandler,
3887 "unicodeescape", "end of string in escape sequence",
3888 &starts, &end, &startinpos, &endinpos, &exc, &s,
3889 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 goto onError;
3891 goto nextByte;
3892 }
3893 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003894 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003895 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 endinpos = (s+i+1)-starts;
3897 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003898 errors, &errorHandler,
3899 "unicodeescape", message,
3900 &starts, &end, &startinpos, &endinpos, &exc, &s,
3901 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003902 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003903 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003904 }
3905 chr = (chr<<4) & ~0xF;
3906 if (c >= '0' && c <= '9')
3907 chr += c - '0';
3908 else if (c >= 'a' && c <= 'f')
3909 chr += 10 + c - 'a';
3910 else
3911 chr += 10 + c - 'A';
3912 }
3913 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003914 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915 /* _decoding_error will have already written into the
3916 target buffer. */
3917 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003918 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003919 /* when we get here, chr is a 32-bit unicode character */
3920 if (chr <= 0xffff)
3921 /* UCS-2 character */
3922 *p++ = (Py_UNICODE) chr;
3923 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003924 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003925 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003926#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003927 *p++ = chr;
3928#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003929 chr -= 0x10000L;
3930 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003931 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003932#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003933 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 endinpos = s-starts;
3935 outpos = p-PyUnicode_AS_UNICODE(v);
3936 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003937 errors, &errorHandler,
3938 "unicodeescape", "illegal Unicode character",
3939 &starts, &end, &startinpos, &endinpos, &exc, &s,
3940 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003941 goto onError;
3942 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003943 break;
3944
Benjamin Peterson29060642009-01-31 22:14:21 +00003945 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003946 case 'N':
3947 message = "malformed \\N character escape";
3948 if (ucnhash_CAPI == NULL) {
3949 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003950 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003951 if (ucnhash_CAPI == NULL)
3952 goto ucnhashError;
3953 }
3954 if (*s == '{') {
3955 const char *start = s+1;
3956 /* look for the closing brace */
3957 while (*s != '}' && s < end)
3958 s++;
3959 if (s > start && s < end && *s == '}') {
3960 /* found a name. look it up in the unicode database */
3961 message = "unknown Unicode character name";
3962 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003963 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003964 goto store;
3965 }
3966 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 endinpos = s-starts;
3968 outpos = p-PyUnicode_AS_UNICODE(v);
3969 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003970 errors, &errorHandler,
3971 "unicodeescape", message,
3972 &starts, &end, &startinpos, &endinpos, &exc, &s,
3973 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003974 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003975 break;
3976
3977 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003978 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 message = "\\ at end of string";
3980 s--;
3981 endinpos = s-starts;
3982 outpos = p-PyUnicode_AS_UNICODE(v);
3983 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003984 errors, &errorHandler,
3985 "unicodeescape", message,
3986 &starts, &end, &startinpos, &endinpos, &exc, &s,
3987 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003988 goto onError;
3989 }
3990 else {
3991 *p++ = '\\';
3992 *p++ = (unsigned char)s[-1];
3993 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003994 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003997 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003999 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00004001 Py_XDECREF(errorHandler);
4002 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00004004
Benjamin Peterson29060642009-01-31 22:14:21 +00004005 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00004006 PyErr_SetString(
4007 PyExc_UnicodeError,
4008 "\\N escapes not supported (can't load unicodedata module)"
4009 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004010 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 Py_XDECREF(errorHandler);
4012 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00004013 return NULL;
4014
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 Py_XDECREF(errorHandler);
4018 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 return NULL;
4020}
4021
/* Return a Unicode-Escape string version of the Unicode object.

   The result is returned as a bytes object; no surrounding quotes are
   added to it.

*/
4028
Thomas Wouters477c8d52006-05-27 19:21:47 +00004029Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004030 Py_ssize_t size,
4031 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00004032{
4033 /* like wcschr, but doesn't stop at NULL characters */
4034
4035 while (size-- > 0) {
4036 if (*s == ch)
4037 return s;
4038 s++;
4039 }
4040
4041 return NULL;
4042}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004043
Walter Dörwald79e913e2007-05-12 11:08:06 +00004044static const char *hexdigits = "0123456789abcdef";
4045
Alexander Belopolsky40018472011-02-26 01:02:56 +00004046PyObject *
4047PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4048 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004050 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004053#ifdef Py_UNICODE_WIDE
4054 const Py_ssize_t expandsize = 10;
4055#else
4056 const Py_ssize_t expandsize = 6;
4057#endif
4058
Thomas Wouters89f507f2006-12-13 04:49:30 +00004059 /* XXX(nnorwitz): rather than over-allocating, it would be
4060 better to choose a different scheme. Perhaps scan the
4061 first N-chars of the string and allocate based on that size.
4062 */
4063 /* Initial allocation is based on the longest-possible unichr
4064 escape.
4065
4066 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4067 unichr, so in this case it's the longest unichr escape. In
4068 narrow (UTF-16) builds this is five chars per source unichr
4069 since there are two unichrs in the surrogate pair, so in narrow
4070 (UTF-16) builds it's not the longest unichr escape.
4071
4072 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4073 so in the narrow (UTF-16) build case it's the longest unichr
4074 escape.
4075 */
4076
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004077 if (size == 0)
4078 return PyBytes_FromStringAndSize(NULL, 0);
4079
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004080 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004082
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004083 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 2
4085 + expandsize*size
4086 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 if (repr == NULL)
4088 return NULL;
4089
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004090 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 while (size-- > 0) {
4093 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004094
Walter Dörwald79e913e2007-05-12 11:08:06 +00004095 /* Escape backslashes */
4096 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 *p++ = '\\';
4098 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004099 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004100 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004101
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004102#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004103 /* Map 21-bit characters to '\U00xxxxxx' */
4104 else if (ch >= 0x10000) {
4105 *p++ = '\\';
4106 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004107 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4108 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4109 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4110 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4111 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4112 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4113 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4114 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004116 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004117#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4119 else if (ch >= 0xD800 && ch < 0xDC00) {
4120 Py_UNICODE ch2;
4121 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004122
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 ch2 = *s++;
4124 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4127 *p++ = '\\';
4128 *p++ = 'U';
4129 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4130 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4131 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4132 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4133 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4134 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4135 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4136 *p++ = hexdigits[ucs & 0x0000000F];
4137 continue;
4138 }
4139 /* Fall through: isolated surrogates are copied as-is */
4140 s--;
4141 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004142 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004143#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004144
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004146 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 *p++ = '\\';
4148 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004149 *p++ = hexdigits[(ch >> 12) & 0x000F];
4150 *p++ = hexdigits[(ch >> 8) & 0x000F];
4151 *p++ = hexdigits[(ch >> 4) & 0x000F];
4152 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004154
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004155 /* Map special whitespace to '\t', \n', '\r' */
4156 else if (ch == '\t') {
4157 *p++ = '\\';
4158 *p++ = 't';
4159 }
4160 else if (ch == '\n') {
4161 *p++ = '\\';
4162 *p++ = 'n';
4163 }
4164 else if (ch == '\r') {
4165 *p++ = '\\';
4166 *p++ = 'r';
4167 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004168
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004169 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004170 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004172 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004173 *p++ = hexdigits[(ch >> 4) & 0x000F];
4174 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004175 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004176
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 /* Copy everything else as-is */
4178 else
4179 *p++ = (char) ch;
4180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004182 assert(p - PyBytes_AS_STRING(repr) > 0);
4183 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4184 return NULL;
4185 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186}
4187
Alexander Belopolsky40018472011-02-26 01:02:56 +00004188PyObject *
4189PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004191 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 if (!PyUnicode_Check(unicode)) {
4193 PyErr_BadArgument();
4194 return NULL;
4195 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004196 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4197 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004198 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199}
4200
4201/* --- Raw Unicode Escape Codec ------------------------------------------- */
4202
Alexander Belopolsky40018472011-02-26 01:02:56 +00004203PyObject *
4204PyUnicode_DecodeRawUnicodeEscape(const char *s,
4205 Py_ssize_t size,
4206 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004209 Py_ssize_t startinpos;
4210 Py_ssize_t endinpos;
4211 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 const char *end;
4215 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 PyObject *errorHandler = NULL;
4217 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004218
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219 /* Escaped strings will always be longer than the resulting
4220 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 length after conversion to the true value. (But decoding error
4222 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223 v = _PyUnicode_New(size);
4224 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 end = s + size;
4230 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 unsigned char c;
4232 Py_UCS4 x;
4233 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004234 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 /* Non-escape characters are interpreted as Unicode ordinals */
4237 if (*s != '\\') {
4238 *p++ = (unsigned char)*s++;
4239 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004240 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004241 startinpos = s-starts;
4242
4243 /* \u-escapes are only interpreted iff the number of leading
4244 backslashes if odd */
4245 bs = s;
4246 for (;s < end;) {
4247 if (*s != '\\')
4248 break;
4249 *p++ = (unsigned char)*s++;
4250 }
4251 if (((s - bs) & 1) == 0 ||
4252 s >= end ||
4253 (*s != 'u' && *s != 'U')) {
4254 continue;
4255 }
4256 p--;
4257 count = *s=='u' ? 4 : 8;
4258 s++;
4259
4260 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4261 outpos = p-PyUnicode_AS_UNICODE(v);
4262 for (x = 0, i = 0; i < count; ++i, ++s) {
4263 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004264 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 endinpos = s-starts;
4266 if (unicode_decode_call_errorhandler(
4267 errors, &errorHandler,
4268 "rawunicodeescape", "truncated \\uXXXX",
4269 &starts, &end, &startinpos, &endinpos, &exc, &s,
4270 &v, &outpos, &p))
4271 goto onError;
4272 goto nextByte;
4273 }
4274 x = (x<<4) & ~0xF;
4275 if (c >= '0' && c <= '9')
4276 x += c - '0';
4277 else if (c >= 'a' && c <= 'f')
4278 x += 10 + c - 'a';
4279 else
4280 x += 10 + c - 'A';
4281 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004282 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 /* UCS-2 character */
4284 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004285 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 /* UCS-4 character. Either store directly, or as
4287 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004288#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004290#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 x -= 0x10000L;
4292 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4293 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004294#endif
4295 } else {
4296 endinpos = s-starts;
4297 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004298 if (unicode_decode_call_errorhandler(
4299 errors, &errorHandler,
4300 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004301 &starts, &end, &startinpos, &endinpos, &exc, &s,
4302 &v, &outpos, &p))
4303 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004304 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004305 nextByte:
4306 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004308 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 Py_XDECREF(errorHandler);
4311 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004313
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 Py_XDECREF(errorHandler);
4317 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 return NULL;
4319}
4320
Alexander Belopolsky40018472011-02-26 01:02:56 +00004321PyObject *
4322PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4323 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004325 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 char *p;
4327 char *q;
4328
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004329#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004330 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004331#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004332 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004333#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004334
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004335 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004336 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004337
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004338 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 if (repr == NULL)
4340 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004341 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004342 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004344 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 while (size-- > 0) {
4346 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004347#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 /* Map 32-bit characters to '\Uxxxxxxxx' */
4349 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004350 *p++ = '\\';
4351 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004352 *p++ = hexdigits[(ch >> 28) & 0xf];
4353 *p++ = hexdigits[(ch >> 24) & 0xf];
4354 *p++ = hexdigits[(ch >> 20) & 0xf];
4355 *p++ = hexdigits[(ch >> 16) & 0xf];
4356 *p++ = hexdigits[(ch >> 12) & 0xf];
4357 *p++ = hexdigits[(ch >> 8) & 0xf];
4358 *p++ = hexdigits[(ch >> 4) & 0xf];
4359 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004360 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004361 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004362#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4364 if (ch >= 0xD800 && ch < 0xDC00) {
4365 Py_UNICODE ch2;
4366 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004367
Benjamin Peterson29060642009-01-31 22:14:21 +00004368 ch2 = *s++;
4369 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004370 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4372 *p++ = '\\';
4373 *p++ = 'U';
4374 *p++ = hexdigits[(ucs >> 28) & 0xf];
4375 *p++ = hexdigits[(ucs >> 24) & 0xf];
4376 *p++ = hexdigits[(ucs >> 20) & 0xf];
4377 *p++ = hexdigits[(ucs >> 16) & 0xf];
4378 *p++ = hexdigits[(ucs >> 12) & 0xf];
4379 *p++ = hexdigits[(ucs >> 8) & 0xf];
4380 *p++ = hexdigits[(ucs >> 4) & 0xf];
4381 *p++ = hexdigits[ucs & 0xf];
4382 continue;
4383 }
4384 /* Fall through: isolated surrogates are copied as-is */
4385 s--;
4386 size++;
4387 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004388#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 /* Map 16-bit characters to '\uxxxx' */
4390 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 *p++ = '\\';
4392 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004393 *p++ = hexdigits[(ch >> 12) & 0xf];
4394 *p++ = hexdigits[(ch >> 8) & 0xf];
4395 *p++ = hexdigits[(ch >> 4) & 0xf];
4396 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 /* Copy everything else as-is */
4399 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 *p++ = (char) ch;
4401 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004402 size = p - q;
4403
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004404 assert(size > 0);
4405 if (_PyBytes_Resize(&repr, size) < 0)
4406 return NULL;
4407 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408}
4409
Alexander Belopolsky40018472011-02-26 01:02:56 +00004410PyObject *
4411PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004413 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004415 PyErr_BadArgument();
4416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004418 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4419 PyUnicode_GET_SIZE(unicode));
4420
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004421 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422}
4423
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004424/* --- Unicode Internal Codec ------------------------------------------- */
4425
Alexander Belopolsky40018472011-02-26 01:02:56 +00004426PyObject *
4427_PyUnicode_DecodeUnicodeInternal(const char *s,
4428 Py_ssize_t size,
4429 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004430{
4431 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t startinpos;
4433 Py_ssize_t endinpos;
4434 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004435 PyUnicodeObject *v;
4436 Py_UNICODE *p;
4437 const char *end;
4438 const char *reason;
4439 PyObject *errorHandler = NULL;
4440 PyObject *exc = NULL;
4441
Neal Norwitzd43069c2006-01-08 01:12:10 +00004442#ifdef Py_UNICODE_WIDE
4443 Py_UNICODE unimax = PyUnicode_GetMax();
4444#endif
4445
Thomas Wouters89f507f2006-12-13 04:49:30 +00004446 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004447 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4448 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004450 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004451 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004452 p = PyUnicode_AS_UNICODE(v);
4453 end = s + size;
4454
4455 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004456 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004457 /* We have to sanity check the raw data, otherwise doom looms for
4458 some malformed UCS-4 data. */
4459 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004460#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004461 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004462#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004463 end-s < Py_UNICODE_SIZE
4464 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004466 startinpos = s - starts;
4467 if (end-s < Py_UNICODE_SIZE) {
4468 endinpos = end-starts;
4469 reason = "truncated input";
4470 }
4471 else {
4472 endinpos = s - starts + Py_UNICODE_SIZE;
4473 reason = "illegal code point (> 0x10FFFF)";
4474 }
4475 outpos = p - PyUnicode_AS_UNICODE(v);
4476 if (unicode_decode_call_errorhandler(
4477 errors, &errorHandler,
4478 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004479 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004480 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004481 goto onError;
4482 }
4483 }
4484 else {
4485 p++;
4486 s += Py_UNICODE_SIZE;
4487 }
4488 }
4489
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004490 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004491 goto onError;
4492 Py_XDECREF(errorHandler);
4493 Py_XDECREF(exc);
4494 return (PyObject *)v;
4495
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004497 Py_XDECREF(v);
4498 Py_XDECREF(errorHandler);
4499 Py_XDECREF(exc);
4500 return NULL;
4501}
4502
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503/* --- Latin-1 Codec ------------------------------------------------------ */
4504
Alexander Belopolsky40018472011-02-26 01:02:56 +00004505PyObject *
4506PyUnicode_DecodeLatin1(const char *s,
4507 Py_ssize_t size,
4508 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509{
4510 PyUnicodeObject *v;
4511 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004512 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004513
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004515 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 Py_UNICODE r = *(unsigned char*)s;
4517 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004518 }
4519
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 v = _PyUnicode_New(size);
4521 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004526 e = s + size;
4527 /* Unrolling the copy makes it much faster by reducing the looping
4528 overhead. This is similar to what many memcpy() implementations do. */
4529 unrolled_end = e - 4;
4530 while (s < unrolled_end) {
4531 p[0] = (unsigned char) s[0];
4532 p[1] = (unsigned char) s[1];
4533 p[2] = (unsigned char) s[2];
4534 p[3] = (unsigned char) s[3];
4535 s += 4;
4536 p += 4;
4537 }
4538 while (s < e)
4539 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004541
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 Py_XDECREF(v);
4544 return NULL;
4545}
4546
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004548static void
4549make_encode_exception(PyObject **exceptionObject,
4550 const char *encoding,
4551 const Py_UNICODE *unicode, Py_ssize_t size,
4552 Py_ssize_t startpos, Py_ssize_t endpos,
4553 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 *exceptionObject = PyUnicodeEncodeError_Create(
4557 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558 }
4559 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4561 goto onError;
4562 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4563 goto onError;
4564 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4565 goto onError;
4566 return;
4567 onError:
4568 Py_DECREF(*exceptionObject);
4569 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 }
4571}
4572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004574static void
4575raise_encode_exception(PyObject **exceptionObject,
4576 const char *encoding,
4577 const Py_UNICODE *unicode, Py_ssize_t size,
4578 Py_ssize_t startpos, Py_ssize_t endpos,
4579 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580{
4581 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585}
4586
4587/* error handling callback helper:
4588 build arguments, call the callback and check the arguments,
4589 put the result into newpos and return the replacement string, which
4590 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004591static PyObject *
4592unicode_encode_call_errorhandler(const char *errors,
4593 PyObject **errorHandler,
4594 const char *encoding, const char *reason,
4595 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4596 Py_ssize_t startpos, Py_ssize_t endpos,
4597 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004599 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600
4601 PyObject *restuple;
4602 PyObject *resunicode;
4603
4604 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 }
4609
4610 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614
4615 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004620 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 Py_DECREF(restuple);
4622 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004624 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 &resunicode, newpos)) {
4626 Py_DECREF(restuple);
4627 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004629 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4630 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4631 Py_DECREF(restuple);
4632 return NULL;
4633 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004636 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4638 Py_DECREF(restuple);
4639 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004640 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 Py_INCREF(resunicode);
4642 Py_DECREF(restuple);
4643 return resunicode;
4644}
4645
Alexander Belopolsky40018472011-02-26 01:02:56 +00004646static PyObject *
4647unicode_encode_ucs1(const Py_UNICODE *p,
4648 Py_ssize_t size,
4649 const char *errors,
4650 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651{
4652 /* output object */
4653 PyObject *res;
4654 /* pointers to the beginning and end+1 of input */
4655 const Py_UNICODE *startp = p;
4656 const Py_UNICODE *endp = p + size;
4657 /* pointer to the beginning of the unencodable characters */
4658 /* const Py_UNICODE *badp = NULL; */
4659 /* pointer into the output */
4660 char *str;
4661 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004662 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004663 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4664 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 PyObject *errorHandler = NULL;
4666 PyObject *exc = NULL;
4667 /* the following variable is used for caching string comparisons
4668 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4669 int known_errorHandler = -1;
4670
4671 /* allocate enough for a simple encoding without
4672 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004673 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004674 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004675 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004677 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004678 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 ressize = size;
4680
4681 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 /* can we encode this? */
4685 if (c<limit) {
4686 /* no overflow check, because we know that the space is enough */
4687 *str++ = (char)c;
4688 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004689 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 else {
4691 Py_ssize_t unicodepos = p-startp;
4692 Py_ssize_t requiredsize;
4693 PyObject *repunicode;
4694 Py_ssize_t repsize;
4695 Py_ssize_t newpos;
4696 Py_ssize_t respos;
4697 Py_UNICODE *uni2;
4698 /* startpos for collecting unencodable chars */
4699 const Py_UNICODE *collstart = p;
4700 const Py_UNICODE *collend = p;
4701 /* find all unecodable characters */
4702 while ((collend < endp) && ((*collend)>=limit))
4703 ++collend;
4704 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4705 if (known_errorHandler==-1) {
4706 if ((errors==NULL) || (!strcmp(errors, "strict")))
4707 known_errorHandler = 1;
4708 else if (!strcmp(errors, "replace"))
4709 known_errorHandler = 2;
4710 else if (!strcmp(errors, "ignore"))
4711 known_errorHandler = 3;
4712 else if (!strcmp(errors, "xmlcharrefreplace"))
4713 known_errorHandler = 4;
4714 else
4715 known_errorHandler = 0;
4716 }
4717 switch (known_errorHandler) {
4718 case 1: /* strict */
4719 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4720 goto onError;
4721 case 2: /* replace */
4722 while (collstart++<collend)
4723 *str++ = '?'; /* fall through */
4724 case 3: /* ignore */
4725 p = collend;
4726 break;
4727 case 4: /* xmlcharrefreplace */
4728 respos = str - PyBytes_AS_STRING(res);
4729 /* determine replacement size (temporarily (mis)uses p) */
4730 for (p = collstart, repsize = 0; p < collend; ++p) {
4731 if (*p<10)
4732 repsize += 2+1+1;
4733 else if (*p<100)
4734 repsize += 2+2+1;
4735 else if (*p<1000)
4736 repsize += 2+3+1;
4737 else if (*p<10000)
4738 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004739#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 else
4741 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004742#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 else if (*p<100000)
4744 repsize += 2+5+1;
4745 else if (*p<1000000)
4746 repsize += 2+6+1;
4747 else
4748 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004749#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 }
4751 requiredsize = respos+repsize+(endp-collend);
4752 if (requiredsize > ressize) {
4753 if (requiredsize<2*ressize)
4754 requiredsize = 2*ressize;
4755 if (_PyBytes_Resize(&res, requiredsize))
4756 goto onError;
4757 str = PyBytes_AS_STRING(res) + respos;
4758 ressize = requiredsize;
4759 }
4760 /* generate replacement (temporarily (mis)uses p) */
4761 for (p = collstart; p < collend; ++p) {
4762 str += sprintf(str, "&#%d;", (int)*p);
4763 }
4764 p = collend;
4765 break;
4766 default:
4767 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4768 encoding, reason, startp, size, &exc,
4769 collstart-startp, collend-startp, &newpos);
4770 if (repunicode == NULL)
4771 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004772 if (PyBytes_Check(repunicode)) {
4773 /* Directly copy bytes result to output. */
4774 repsize = PyBytes_Size(repunicode);
4775 if (repsize > 1) {
4776 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004777 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004778 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4779 Py_DECREF(repunicode);
4780 goto onError;
4781 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004782 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004783 ressize += repsize-1;
4784 }
4785 memcpy(str, PyBytes_AsString(repunicode), repsize);
4786 str += repsize;
4787 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004788 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004789 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004790 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 /* need more space? (at least enough for what we
4792 have+the replacement+the rest of the string, so
4793 we won't have to check space for encodable characters) */
4794 respos = str - PyBytes_AS_STRING(res);
4795 repsize = PyUnicode_GET_SIZE(repunicode);
4796 requiredsize = respos+repsize+(endp-collend);
4797 if (requiredsize > ressize) {
4798 if (requiredsize<2*ressize)
4799 requiredsize = 2*ressize;
4800 if (_PyBytes_Resize(&res, requiredsize)) {
4801 Py_DECREF(repunicode);
4802 goto onError;
4803 }
4804 str = PyBytes_AS_STRING(res) + respos;
4805 ressize = requiredsize;
4806 }
4807 /* check if there is anything unencodable in the replacement
4808 and copy it to the output */
4809 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4810 c = *uni2;
4811 if (c >= limit) {
4812 raise_encode_exception(&exc, encoding, startp, size,
4813 unicodepos, unicodepos+1, reason);
4814 Py_DECREF(repunicode);
4815 goto onError;
4816 }
4817 *str = (char)c;
4818 }
4819 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004820 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004821 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004822 }
4823 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004824 /* Resize if we allocated to much */
4825 size = str - PyBytes_AS_STRING(res);
4826 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004827 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004828 if (_PyBytes_Resize(&res, size) < 0)
4829 goto onError;
4830 }
4831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 Py_XDECREF(errorHandler);
4833 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004834 return res;
4835
4836 onError:
4837 Py_XDECREF(res);
4838 Py_XDECREF(errorHandler);
4839 Py_XDECREF(exc);
4840 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841}
4842
Alexander Belopolsky40018472011-02-26 01:02:56 +00004843PyObject *
4844PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4845 Py_ssize_t size,
4846 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849}
4850
Alexander Belopolsky40018472011-02-26 01:02:56 +00004851PyObject *
4852PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853{
4854 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 PyErr_BadArgument();
4856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 }
4858 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004859 PyUnicode_GET_SIZE(unicode),
4860 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861}
4862
4863/* --- 7-bit ASCII Codec -------------------------------------------------- */
4864
Alexander Belopolsky40018472011-02-26 01:02:56 +00004865PyObject *
4866PyUnicode_DecodeASCII(const char *s,
4867 Py_ssize_t size,
4868 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 PyUnicodeObject *v;
4872 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t startinpos;
4874 Py_ssize_t endinpos;
4875 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 const char *e;
4877 PyObject *errorHandler = NULL;
4878 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004879
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004881 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 Py_UNICODE r = *(unsigned char*)s;
4883 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004884 }
Tim Petersced69f82003-09-16 20:30:58 +00004885
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 v = _PyUnicode_New(size);
4887 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004890 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 e = s + size;
4893 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004894 register unsigned char c = (unsigned char)*s;
4895 if (c < 128) {
4896 *p++ = c;
4897 ++s;
4898 }
4899 else {
4900 startinpos = s-starts;
4901 endinpos = startinpos + 1;
4902 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4903 if (unicode_decode_call_errorhandler(
4904 errors, &errorHandler,
4905 "ascii", "ordinal not in range(128)",
4906 &starts, &e, &startinpos, &endinpos, &exc, &s,
4907 &v, &outpos, &p))
4908 goto onError;
4909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004911 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4913 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 Py_XDECREF(errorHandler);
4915 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004917
Benjamin Peterson29060642009-01-31 22:14:21 +00004918 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 Py_XDECREF(errorHandler);
4921 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 return NULL;
4923}
4924
Alexander Belopolsky40018472011-02-26 01:02:56 +00004925PyObject *
4926PyUnicode_EncodeASCII(const Py_UNICODE *p,
4927 Py_ssize_t size,
4928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931}
4932
Alexander Belopolsky40018472011-02-26 01:02:56 +00004933PyObject *
4934PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935{
4936 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 PyErr_BadArgument();
4938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 }
4940 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 PyUnicode_GET_SIZE(unicode),
4942 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943}
4944
Victor Stinner99b95382011-07-04 14:23:54 +02004945#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004946
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004947/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004948
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004949#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004950#define NEED_RETRY
4951#endif
4952
4953/* XXX This code is limited to "true" double-byte encodings, as
4954 a) it assumes an incomplete character consists of a single byte, and
4955 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004957
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   IsDBCSLeadByte() alone only looks at the byte value, so the preceding
   character (as located by CharPrev()) is inspected as well; presumably
   this is to tell a real lead byte from a trail byte that merely has a
   lead-byte value -- see the Win32 documentation to confirm the exact
   semantics. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;       /* CharPrev() did not move */
    if (!IsDBCSLeadByte(*prev))
        return 1;       /* previous character does not start with a lead byte */
    return curr - prev == 2;    /* previous character spans exactly two bytes */
}
4969
4970/*
4971 * Decode MBCS string into unicode object. If 'final' is set, converts
4972 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4973 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004974static int
4975decode_mbcs(PyUnicodeObject **v,
4976 const char *s, /* MBCS string */
4977 int size, /* sizeof MBCS string */
4978 int final,
4979 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004980{
4981 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004982 Py_ssize_t n;
4983 DWORD usize;
4984 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004985
4986 assert(size >= 0);
4987
Victor Stinner554f3f02010-06-16 23:33:54 +00004988 /* check and handle 'errors' arg */
4989 if (errors==NULL || strcmp(errors, "strict")==0)
4990 flags = MB_ERR_INVALID_CHARS;
4991 else if (strcmp(errors, "ignore")==0)
4992 flags = 0;
4993 else {
4994 PyErr_Format(PyExc_ValueError,
4995 "mbcs encoding does not support errors='%s'",
4996 errors);
4997 return -1;
4998 }
4999
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000 /* Skip trailing lead-byte unless 'final' is set */
5001 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00005002 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005003
5004 /* First get the size of the result */
5005 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005006 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
5007 if (usize==0)
5008 goto mbcs_decode_error;
5009 } else
5010 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005011
5012 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 /* Create unicode object */
5014 *v = _PyUnicode_New(usize);
5015 if (*v == NULL)
5016 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005017 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018 }
5019 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 /* Extend unicode object */
5021 n = PyUnicode_GET_SIZE(*v);
5022 if (_PyUnicode_Resize(v, n + usize) < 0)
5023 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005024 }
5025
5026 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00005027 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005029 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5030 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005032 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005033 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00005034
5035mbcs_decode_error:
5036 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5037 we raise a UnicodeDecodeError - else it is a 'generic'
5038 windows error
5039 */
5040 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5041 /* Ideally, we should get reason from FormatMessage - this
5042 is the Windows 2000 English version of the message
5043 */
5044 PyObject *exc = NULL;
5045 const char *reason = "No mapping for the Unicode character exists "
5046 "in the target multi-byte code page.";
5047 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5048 if (exc != NULL) {
5049 PyCodec_StrictErrors(exc);
5050 Py_DECREF(exc);
5051 }
5052 } else {
5053 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5054 }
5055 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005056}
5057
Alexander Belopolsky40018472011-02-26 01:02:56 +00005058PyObject *
5059PyUnicode_DecodeMBCSStateful(const char *s,
5060 Py_ssize_t size,
5061 const char *errors,
5062 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005063{
5064 PyUnicodeObject *v = NULL;
5065 int done;
5066
5067 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005069
5070#ifdef NEED_RETRY
5071 retry:
5072 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005073 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005074 else
5075#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005076 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005077
5078 if (done < 0) {
5079 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005081 }
5082
5083 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005085
5086#ifdef NEED_RETRY
5087 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 s += done;
5089 size -= done;
5090 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005091 }
5092#endif
5093
5094 return (PyObject *)v;
5095}
5096
Alexander Belopolsky40018472011-02-26 01:02:56 +00005097PyObject *
5098PyUnicode_DecodeMBCS(const char *s,
5099 Py_ssize_t size,
5100 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005101{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005102 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5103}
5104
5105/*
5106 * Convert unicode into string object (MBCS).
5107 * Returns 0 if succeed, -1 otherwise.
5108 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005109static int
5110encode_mbcs(PyObject **repr,
5111 const Py_UNICODE *p, /* unicode */
5112 int size, /* size of unicode */
5113 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005114{
Victor Stinner554f3f02010-06-16 23:33:54 +00005115 BOOL usedDefaultChar = FALSE;
5116 BOOL *pusedDefaultChar;
5117 int mbcssize;
5118 Py_ssize_t n;
5119 PyObject *exc = NULL;
5120 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005121
5122 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005123
Victor Stinner554f3f02010-06-16 23:33:54 +00005124 /* check and handle 'errors' arg */
5125 if (errors==NULL || strcmp(errors, "strict")==0) {
5126 flags = WC_NO_BEST_FIT_CHARS;
5127 pusedDefaultChar = &usedDefaultChar;
5128 } else if (strcmp(errors, "replace")==0) {
5129 flags = 0;
5130 pusedDefaultChar = NULL;
5131 } else {
5132 PyErr_Format(PyExc_ValueError,
5133 "mbcs encoding does not support errors='%s'",
5134 errors);
5135 return -1;
5136 }
5137
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005138 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005139 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005140 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5141 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 if (mbcssize == 0) {
5143 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5144 return -1;
5145 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005146 /* If we used a default char, then we failed! */
5147 if (pusedDefaultChar && *pusedDefaultChar)
5148 goto mbcs_encode_error;
5149 } else {
5150 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005151 }
5152
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005153 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 /* Create string object */
5155 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5156 if (*repr == NULL)
5157 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005158 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005159 }
5160 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 /* Extend string object */
5162 n = PyBytes_Size(*repr);
5163 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5164 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005165 }
5166
5167 /* Do the conversion */
5168 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005169 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005170 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5171 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5173 return -1;
5174 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005175 if (pusedDefaultChar && *pusedDefaultChar)
5176 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005177 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005178 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005179
5180mbcs_encode_error:
5181 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5182 Py_XDECREF(exc);
5183 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005184}
5185
Alexander Belopolsky40018472011-02-26 01:02:56 +00005186PyObject *
5187PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5188 Py_ssize_t size,
5189 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005190{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005191 PyObject *repr = NULL;
5192 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005193
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005194#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005196 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005197 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005198 else
5199#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005200 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005201
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005202 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 Py_XDECREF(repr);
5204 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005205 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005206
5207#ifdef NEED_RETRY
5208 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 p += INT_MAX;
5210 size -= INT_MAX;
5211 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005212 }
5213#endif
5214
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005215 return repr;
5216}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005217
Alexander Belopolsky40018472011-02-26 01:02:56 +00005218PyObject *
5219PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005220{
5221 if (!PyUnicode_Check(unicode)) {
5222 PyErr_BadArgument();
5223 return NULL;
5224 }
5225 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 PyUnicode_GET_SIZE(unicode),
5227 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005228}
5229
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005230#undef NEED_RETRY
5231
Victor Stinner99b95382011-07-04 14:23:54 +02005232#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005233
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234/* --- Character Mapping Codec -------------------------------------------- */
5235
Alexander Belopolsky40018472011-02-26 01:02:56 +00005236PyObject *
5237PyUnicode_DecodeCharmap(const char *s,
5238 Py_ssize_t size,
5239 PyObject *mapping,
5240 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005243 Py_ssize_t startinpos;
5244 Py_ssize_t endinpos;
5245 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 PyUnicodeObject *v;
5248 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005249 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 PyObject *errorHandler = NULL;
5251 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005252 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005253 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005254
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 /* Default to Latin-1 */
5256 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258
5259 v = _PyUnicode_New(size);
5260 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005265 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005266 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 mapstring = PyUnicode_AS_UNICODE(mapping);
5268 maplen = PyUnicode_GET_SIZE(mapping);
5269 while (s < e) {
5270 unsigned char ch = *s;
5271 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 if (ch < maplen)
5274 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275
Benjamin Peterson29060642009-01-31 22:14:21 +00005276 if (x == 0xfffe) {
5277 /* undefined mapping */
5278 outpos = p-PyUnicode_AS_UNICODE(v);
5279 startinpos = s-starts;
5280 endinpos = startinpos+1;
5281 if (unicode_decode_call_errorhandler(
5282 errors, &errorHandler,
5283 "charmap", "character maps to <undefined>",
5284 &starts, &e, &startinpos, &endinpos, &exc, &s,
5285 &v, &outpos, &p)) {
5286 goto onError;
5287 }
5288 continue;
5289 }
5290 *p++ = x;
5291 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005292 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005293 }
5294 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 while (s < e) {
5296 unsigned char ch = *s;
5297 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005298
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5300 w = PyLong_FromLong((long)ch);
5301 if (w == NULL)
5302 goto onError;
5303 x = PyObject_GetItem(mapping, w);
5304 Py_DECREF(w);
5305 if (x == NULL) {
5306 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5307 /* No mapping found means: mapping is undefined. */
5308 PyErr_Clear();
5309 x = Py_None;
5310 Py_INCREF(x);
5311 } else
5312 goto onError;
5313 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005314
Benjamin Peterson29060642009-01-31 22:14:21 +00005315 /* Apply mapping */
5316 if (PyLong_Check(x)) {
5317 long value = PyLong_AS_LONG(x);
5318 if (value < 0 || value > 65535) {
5319 PyErr_SetString(PyExc_TypeError,
5320 "character mapping must be in range(65536)");
5321 Py_DECREF(x);
5322 goto onError;
5323 }
5324 *p++ = (Py_UNICODE)value;
5325 }
5326 else if (x == Py_None) {
5327 /* undefined mapping */
5328 outpos = p-PyUnicode_AS_UNICODE(v);
5329 startinpos = s-starts;
5330 endinpos = startinpos+1;
5331 if (unicode_decode_call_errorhandler(
5332 errors, &errorHandler,
5333 "charmap", "character maps to <undefined>",
5334 &starts, &e, &startinpos, &endinpos, &exc, &s,
5335 &v, &outpos, &p)) {
5336 Py_DECREF(x);
5337 goto onError;
5338 }
5339 Py_DECREF(x);
5340 continue;
5341 }
5342 else if (PyUnicode_Check(x)) {
5343 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005344
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 if (targetsize == 1)
5346 /* 1-1 mapping */
5347 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005348
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 else if (targetsize > 1) {
5350 /* 1-n mapping */
5351 if (targetsize > extrachars) {
5352 /* resize first */
5353 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5354 Py_ssize_t needed = (targetsize - extrachars) + \
5355 (targetsize << 2);
5356 extrachars += needed;
5357 /* XXX overflow detection missing */
5358 if (_PyUnicode_Resize(&v,
5359 PyUnicode_GET_SIZE(v) + needed) < 0) {
5360 Py_DECREF(x);
5361 goto onError;
5362 }
5363 p = PyUnicode_AS_UNICODE(v) + oldpos;
5364 }
5365 Py_UNICODE_COPY(p,
5366 PyUnicode_AS_UNICODE(x),
5367 targetsize);
5368 p += targetsize;
5369 extrachars -= targetsize;
5370 }
5371 /* 1-0 mapping: skip the character */
5372 }
5373 else {
5374 /* wrong return value */
5375 PyErr_SetString(PyExc_TypeError,
5376 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005377 Py_DECREF(x);
5378 goto onError;
5379 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 Py_DECREF(x);
5381 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 }
5384 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5386 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005387 Py_XDECREF(errorHandler);
5388 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005390
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005392 Py_XDECREF(errorHandler);
5393 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 Py_XDECREF(v);
5395 return NULL;
5396}
5397
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005398/* Charmap encoding: the lookup table */
5399
Alexander Belopolsky40018472011-02-26 01:02:56 +00005400struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 PyObject_HEAD
5402 unsigned char level1[32];
5403 int count2, count3;
5404 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005405};
5406
5407static PyObject*
5408encoding_map_size(PyObject *obj, PyObject* args)
5409{
5410 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005411 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005413}
5414
5415static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005416 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 PyDoc_STR("Return the size (in bytes) of this object") },
5418 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005419};
5420
5421static void
5422encoding_map_dealloc(PyObject* o)
5423{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005424 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005425}
5426
5427static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005428 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 "EncodingMap", /*tp_name*/
5430 sizeof(struct encoding_map), /*tp_basicsize*/
5431 0, /*tp_itemsize*/
5432 /* methods */
5433 encoding_map_dealloc, /*tp_dealloc*/
5434 0, /*tp_print*/
5435 0, /*tp_getattr*/
5436 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005437 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 0, /*tp_repr*/
5439 0, /*tp_as_number*/
5440 0, /*tp_as_sequence*/
5441 0, /*tp_as_mapping*/
5442 0, /*tp_hash*/
5443 0, /*tp_call*/
5444 0, /*tp_str*/
5445 0, /*tp_getattro*/
5446 0, /*tp_setattro*/
5447 0, /*tp_as_buffer*/
5448 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5449 0, /*tp_doc*/
5450 0, /*tp_traverse*/
5451 0, /*tp_clear*/
5452 0, /*tp_richcompare*/
5453 0, /*tp_weaklistoffset*/
5454 0, /*tp_iter*/
5455 0, /*tp_iternext*/
5456 encoding_map_methods, /*tp_methods*/
5457 0, /*tp_members*/
5458 0, /*tp_getset*/
5459 0, /*tp_base*/
5460 0, /*tp_dict*/
5461 0, /*tp_descr_get*/
5462 0, /*tp_descr_set*/
5463 0, /*tp_dictoffset*/
5464 0, /*tp_init*/
5465 0, /*tp_alloc*/
5466 0, /*tp_new*/
5467 0, /*tp_free*/
5468 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005469};
5470
5471PyObject*
5472PyUnicode_BuildEncodingMap(PyObject* string)
5473{
5474 Py_UNICODE *decode;
5475 PyObject *result;
5476 struct encoding_map *mresult;
5477 int i;
5478 int need_dict = 0;
5479 unsigned char level1[32];
5480 unsigned char level2[512];
5481 unsigned char *mlevel1, *mlevel2, *mlevel3;
5482 int count2 = 0, count3 = 0;
5483
5484 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5485 PyErr_BadArgument();
5486 return NULL;
5487 }
5488 decode = PyUnicode_AS_UNICODE(string);
5489 memset(level1, 0xFF, sizeof level1);
5490 memset(level2, 0xFF, sizeof level2);
5491
5492 /* If there isn't a one-to-one mapping of NULL to \0,
5493 or if there are non-BMP characters, we need to use
5494 a mapping dictionary. */
5495 if (decode[0] != 0)
5496 need_dict = 1;
5497 for (i = 1; i < 256; i++) {
5498 int l1, l2;
5499 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005500#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005501 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005502#endif
5503 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005504 need_dict = 1;
5505 break;
5506 }
5507 if (decode[i] == 0xFFFE)
5508 /* unmapped character */
5509 continue;
5510 l1 = decode[i] >> 11;
5511 l2 = decode[i] >> 7;
5512 if (level1[l1] == 0xFF)
5513 level1[l1] = count2++;
5514 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005515 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005516 }
5517
5518 if (count2 >= 0xFF || count3 >= 0xFF)
5519 need_dict = 1;
5520
5521 if (need_dict) {
5522 PyObject *result = PyDict_New();
5523 PyObject *key, *value;
5524 if (!result)
5525 return NULL;
5526 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005527 key = PyLong_FromLong(decode[i]);
5528 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005529 if (!key || !value)
5530 goto failed1;
5531 if (PyDict_SetItem(result, key, value) == -1)
5532 goto failed1;
5533 Py_DECREF(key);
5534 Py_DECREF(value);
5535 }
5536 return result;
5537 failed1:
5538 Py_XDECREF(key);
5539 Py_XDECREF(value);
5540 Py_DECREF(result);
5541 return NULL;
5542 }
5543
5544 /* Create a three-level trie */
5545 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5546 16*count2 + 128*count3 - 1);
5547 if (!result)
5548 return PyErr_NoMemory();
5549 PyObject_Init(result, &EncodingMapType);
5550 mresult = (struct encoding_map*)result;
5551 mresult->count2 = count2;
5552 mresult->count3 = count3;
5553 mlevel1 = mresult->level1;
5554 mlevel2 = mresult->level23;
5555 mlevel3 = mresult->level23 + 16*count2;
5556 memcpy(mlevel1, level1, 32);
5557 memset(mlevel2, 0xFF, 16*count2);
5558 memset(mlevel3, 0, 128*count3);
5559 count3 = 0;
5560 for (i = 1; i < 256; i++) {
5561 int o1, o2, o3, i2, i3;
5562 if (decode[i] == 0xFFFE)
5563 /* unmapped character */
5564 continue;
5565 o1 = decode[i]>>11;
5566 o2 = (decode[i]>>7) & 0xF;
5567 i2 = 16*mlevel1[o1] + o2;
5568 if (mlevel2[i2] == 0xFF)
5569 mlevel2[i2] = count3++;
5570 o3 = decode[i] & 0x7F;
5571 i3 = 128*mlevel2[i2] + o3;
5572 mlevel3[i3] = i;
5573 }
5574 return result;
5575}
5576
5577static int
5578encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5579{
5580 struct encoding_map *map = (struct encoding_map*)mapping;
5581 int l1 = c>>11;
5582 int l2 = (c>>7) & 0xF;
5583 int l3 = c & 0x7F;
5584 int i;
5585
5586#ifdef Py_UNICODE_WIDE
5587 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005589 }
5590#endif
5591 if (c == 0)
5592 return 0;
5593 /* level 1*/
5594 i = map->level1[l1];
5595 if (i == 0xFF) {
5596 return -1;
5597 }
5598 /* level 2*/
5599 i = map->level23[16*i+l2];
5600 if (i == 0xFF) {
5601 return -1;
5602 }
5603 /* level 3 */
5604 i = map->level23[16*map->count2 + 128*i + l3];
5605 if (i == 0) {
5606 return -1;
5607 }
5608 return i;
5609}
5610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611/* Lookup the character ch in the mapping. If the character
5612 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005613 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005614static PyObject *
5615charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616{
Christian Heimes217cfd12007-12-02 14:31:20 +00005617 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 PyObject *x;
5619
5620 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 x = PyObject_GetItem(mapping, w);
5623 Py_DECREF(w);
5624 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5626 /* No mapping found means: mapping is undefined. */
5627 PyErr_Clear();
5628 x = Py_None;
5629 Py_INCREF(x);
5630 return x;
5631 } else
5632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005634 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005636 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 long value = PyLong_AS_LONG(x);
5638 if (value < 0 || value > 255) {
5639 PyErr_SetString(PyExc_TypeError,
5640 "character mapping must be in range(256)");
5641 Py_DECREF(x);
5642 return NULL;
5643 }
5644 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005646 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 /* wrong return value */
5650 PyErr_Format(PyExc_TypeError,
5651 "character mapping must return integer, bytes or None, not %.400s",
5652 x->ob_type->tp_name);
5653 Py_DECREF(x);
5654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 }
5656}
5657
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005658static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005659charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005660{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005661 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5662 /* exponentially overallocate to minimize reallocations */
5663 if (requiredsize < 2*outsize)
5664 requiredsize = 2*outsize;
5665 if (_PyBytes_Resize(outobj, requiredsize))
5666 return -1;
5667 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005668}
5669
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005673/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005674 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675 space is available. Return a new reference to the object that
5676 was put in the output buffer, or Py_None, if the mapping was undefined
5677 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005678 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005679static charmapencode_result
5680charmapencode_output(Py_UNICODE c, PyObject *mapping,
5681 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005682{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005683 PyObject *rep;
5684 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005685 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686
Christian Heimes90aa7642007-12-19 02:45:37 +00005687 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005688 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005690 if (res == -1)
5691 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 if (outsize<requiredsize)
5693 if (charmapencode_resize(outobj, outpos, requiredsize))
5694 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005695 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 outstart[(*outpos)++] = (char)res;
5697 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005698 }
5699
5700 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005703 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 Py_DECREF(rep);
5705 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005706 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 if (PyLong_Check(rep)) {
5708 Py_ssize_t requiredsize = *outpos+1;
5709 if (outsize<requiredsize)
5710 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5711 Py_DECREF(rep);
5712 return enc_EXCEPTION;
5713 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005714 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005716 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 else {
5718 const char *repchars = PyBytes_AS_STRING(rep);
5719 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5720 Py_ssize_t requiredsize = *outpos+repsize;
5721 if (outsize<requiredsize)
5722 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5723 Py_DECREF(rep);
5724 return enc_EXCEPTION;
5725 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005726 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 memcpy(outstart + *outpos, repchars, repsize);
5728 *outpos += repsize;
5729 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005730 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005731 Py_DECREF(rep);
5732 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005733}
5734
5735/* handle an error in PyUnicode_EncodeCharmap
5736 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005737static int
5738charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005739 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005741 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005742 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743{
5744 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005745 Py_ssize_t repsize;
5746 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005747 Py_UNICODE *uni2;
5748 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005749 Py_ssize_t collstartpos = *inpos;
5750 Py_ssize_t collendpos = *inpos+1;
5751 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 char *encoding = "charmap";
5753 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005754 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 /* find all unencodable characters */
5757 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005758 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005759 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 int res = encoding_map_lookup(p[collendpos], mapping);
5761 if (res != -1)
5762 break;
5763 ++collendpos;
5764 continue;
5765 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005766
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 rep = charmapencode_lookup(p[collendpos], mapping);
5768 if (rep==NULL)
5769 return -1;
5770 else if (rep!=Py_None) {
5771 Py_DECREF(rep);
5772 break;
5773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005774 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 }
5777 /* cache callback name lookup
5778 * (if not done yet, i.e. it's the first error) */
5779 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 if ((errors==NULL) || (!strcmp(errors, "strict")))
5781 *known_errorHandler = 1;
5782 else if (!strcmp(errors, "replace"))
5783 *known_errorHandler = 2;
5784 else if (!strcmp(errors, "ignore"))
5785 *known_errorHandler = 3;
5786 else if (!strcmp(errors, "xmlcharrefreplace"))
5787 *known_errorHandler = 4;
5788 else
5789 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 }
5791 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005792 case 1: /* strict */
5793 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5794 return -1;
5795 case 2: /* replace */
5796 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 x = charmapencode_output('?', mapping, res, respos);
5798 if (x==enc_EXCEPTION) {
5799 return -1;
5800 }
5801 else if (x==enc_FAILED) {
5802 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5803 return -1;
5804 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005805 }
5806 /* fall through */
5807 case 3: /* ignore */
5808 *inpos = collendpos;
5809 break;
5810 case 4: /* xmlcharrefreplace */
5811 /* generate replacement (temporarily (mis)uses p) */
5812 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 char buffer[2+29+1+1];
5814 char *cp;
5815 sprintf(buffer, "&#%d;", (int)p[collpos]);
5816 for (cp = buffer; *cp; ++cp) {
5817 x = charmapencode_output(*cp, mapping, res, respos);
5818 if (x==enc_EXCEPTION)
5819 return -1;
5820 else if (x==enc_FAILED) {
5821 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5822 return -1;
5823 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005824 }
5825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005826 *inpos = collendpos;
5827 break;
5828 default:
5829 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 encoding, reason, p, size, exceptionObject,
5831 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005832 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005834 if (PyBytes_Check(repunicode)) {
5835 /* Directly copy bytes result to output. */
5836 Py_ssize_t outsize = PyBytes_Size(*res);
5837 Py_ssize_t requiredsize;
5838 repsize = PyBytes_Size(repunicode);
5839 requiredsize = *respos + repsize;
5840 if (requiredsize > outsize)
5841 /* Make room for all additional bytes. */
5842 if (charmapencode_resize(res, respos, requiredsize)) {
5843 Py_DECREF(repunicode);
5844 return -1;
5845 }
5846 memcpy(PyBytes_AsString(*res) + *respos,
5847 PyBytes_AsString(repunicode), repsize);
5848 *respos += repsize;
5849 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005850 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005851 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005852 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005853 /* generate replacement */
5854 repsize = PyUnicode_GET_SIZE(repunicode);
5855 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 x = charmapencode_output(*uni2, mapping, res, respos);
5857 if (x==enc_EXCEPTION) {
5858 return -1;
5859 }
5860 else if (x==enc_FAILED) {
5861 Py_DECREF(repunicode);
5862 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5863 return -1;
5864 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005865 }
5866 *inpos = newpos;
5867 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868 }
5869 return 0;
5870}
5871
Alexander Belopolsky40018472011-02-26 01:02:56 +00005872PyObject *
5873PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5874 Py_ssize_t size,
5875 PyObject *mapping,
5876 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 /* output object */
5879 PyObject *res = NULL;
5880 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005881 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005883 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884 PyObject *errorHandler = NULL;
5885 PyObject *exc = NULL;
5886 /* the following variable is used for caching string comparisons
5887 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5888 * 3=ignore, 4=xmlcharrefreplace */
5889 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
5891 /* Default to Latin-1 */
5892 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 /* allocate enough for a simple encoding without
5896 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005897 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 if (res == NULL)
5899 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005900 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 /* try to encode it */
5905 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5906 if (x==enc_EXCEPTION) /* error */
5907 goto onError;
5908 if (x==enc_FAILED) { /* unencodable character */
5909 if (charmap_encoding_error(p, size, &inpos, mapping,
5910 &exc,
5911 &known_errorHandler, &errorHandler, errors,
5912 &res, &respos)) {
5913 goto onError;
5914 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005915 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 else
5917 /* done with this character => adjust input position */
5918 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005921 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005922 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005923 if (_PyBytes_Resize(&res, respos) < 0)
5924 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005925
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 Py_XDECREF(exc);
5927 Py_XDECREF(errorHandler);
5928 return res;
5929
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 Py_XDECREF(res);
5932 Py_XDECREF(exc);
5933 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 return NULL;
5935}
5936
Alexander Belopolsky40018472011-02-26 01:02:56 +00005937PyObject *
5938PyUnicode_AsCharmapString(PyObject *unicode,
5939 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940{
5941 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 PyErr_BadArgument();
5943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 }
5945 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 PyUnicode_GET_SIZE(unicode),
5947 mapping,
5948 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949}
5950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005951/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005952static void
5953make_translate_exception(PyObject **exceptionObject,
5954 const Py_UNICODE *unicode, Py_ssize_t size,
5955 Py_ssize_t startpos, Py_ssize_t endpos,
5956 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005959 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 }
5962 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5964 goto onError;
5965 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5966 goto onError;
5967 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5968 goto onError;
5969 return;
5970 onError:
5971 Py_DECREF(*exceptionObject);
5972 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 }
5974}
5975
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005977static void
5978raise_translate_exception(PyObject **exceptionObject,
5979 const Py_UNICODE *unicode, Py_ssize_t size,
5980 Py_ssize_t startpos, Py_ssize_t endpos,
5981 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005982{
5983 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005985 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987}
5988
5989/* error handling callback helper:
5990 build arguments, call the callback and check the arguments,
5991 put the result into newpos and return the replacement string, which
5992 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005993static PyObject *
5994unicode_translate_call_errorhandler(const char *errors,
5995 PyObject **errorHandler,
5996 const char *reason,
5997 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5998 Py_ssize_t startpos, Py_ssize_t endpos,
5999 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000{
Benjamin Peterson142957c2008-07-04 19:55:29 +00006001 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006003 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 PyObject *restuple;
6005 PyObject *resunicode;
6006
6007 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006009 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 }
6012
6013 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017
6018 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00006023 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 Py_DECREF(restuple);
6025 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 }
6027 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 &resunicode, &i_newpos)) {
6029 Py_DECREF(restuple);
6030 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006031 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006032 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006034 else
6035 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006036 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6038 Py_DECREF(restuple);
6039 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006041 Py_INCREF(resunicode);
6042 Py_DECREF(restuple);
6043 return resunicode;
6044}
6045
6046/* Lookup the character ch in the mapping and put the result in result,
6047 which must be decrefed by the caller.
6048 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006049static int
6050charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051{
Christian Heimes217cfd12007-12-02 14:31:20 +00006052 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053 PyObject *x;
6054
6055 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006057 x = PyObject_GetItem(mapping, w);
6058 Py_DECREF(w);
6059 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6061 /* No mapping found means: use 1:1 mapping. */
6062 PyErr_Clear();
6063 *result = NULL;
6064 return 0;
6065 } else
6066 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067 }
6068 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 *result = x;
6070 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006072 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 long value = PyLong_AS_LONG(x);
6074 long max = PyUnicode_GetMax();
6075 if (value < 0 || value > max) {
6076 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006077 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 Py_DECREF(x);
6079 return -1;
6080 }
6081 *result = x;
6082 return 0;
6083 }
6084 else if (PyUnicode_Check(x)) {
6085 *result = x;
6086 return 0;
6087 }
6088 else {
6089 /* wrong return value */
6090 PyErr_SetString(PyExc_TypeError,
6091 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006092 Py_DECREF(x);
6093 return -1;
6094 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006095}
6096/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 if not reallocate and adjust various state variables.
6098 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006099static int
6100charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006103 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006104 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 /* remember old output position */
6106 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6107 /* exponentially overallocate to minimize reallocations */
6108 if (requiredsize < 2 * oldsize)
6109 requiredsize = 2 * oldsize;
6110 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6111 return -1;
6112 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113 }
6114 return 0;
6115}
6116/* lookup the character, put the result in the output string and adjust
6117 various state variables. Return a new reference to the object that
6118 was put in the output buffer in *result, or Py_None, if the mapping was
6119 undefined (in which case no character was written).
6120 The called must decref result.
6121 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006122static int
6123charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6124 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6125 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126{
Walter Dörwald4894c302003-10-24 14:25:28 +00006127 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 /* not found => default to 1:1 mapping */
6131 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132 }
6133 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006135 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 /* no overflow check, because we know that the space is enough */
6137 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 }
6139 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6141 if (repsize==1) {
6142 /* no overflow check, because we know that the space is enough */
6143 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6144 }
6145 else if (repsize!=0) {
6146 /* more than one character */
6147 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6148 (insize - (curinp-startinp)) +
6149 repsize - 1;
6150 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6151 return -1;
6152 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6153 *outp += repsize;
6154 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006155 }
6156 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 return 0;
6159}
6160
Alexander Belopolsky40018472011-02-26 01:02:56 +00006161PyObject *
6162PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6163 Py_ssize_t size,
6164 PyObject *mapping,
6165 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167 /* output object */
6168 PyObject *res = NULL;
6169 /* pointers to the beginning and end+1 of input */
6170 const Py_UNICODE *startp = p;
6171 const Py_UNICODE *endp = p + size;
6172 /* pointer into the output */
6173 Py_UNICODE *str;
6174 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006175 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 char *reason = "character maps to <undefined>";
6177 PyObject *errorHandler = NULL;
6178 PyObject *exc = NULL;
6179 /* the following variable is used for caching string comparisons
6180 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6181 * 3=ignore, 4=xmlcharrefreplace */
6182 int known_errorHandler = -1;
6183
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 PyErr_BadArgument();
6186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006188
6189 /* allocate enough for a simple 1:1 translation without
6190 replacements, if we need more, we'll resize */
6191 res = PyUnicode_FromUnicode(NULL, size);
6192 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006198 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 /* try to encode it */
6200 PyObject *x = NULL;
6201 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6202 Py_XDECREF(x);
6203 goto onError;
6204 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006205 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 if (x!=Py_None) /* it worked => adjust input pointer */
6207 ++p;
6208 else { /* untranslatable character */
6209 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6210 Py_ssize_t repsize;
6211 Py_ssize_t newpos;
6212 Py_UNICODE *uni2;
6213 /* startpos for collecting untranslatable chars */
6214 const Py_UNICODE *collstart = p;
6215 const Py_UNICODE *collend = p+1;
6216 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 /* find all untranslatable characters */
6219 while (collend < endp) {
6220 if (charmaptranslate_lookup(*collend, mapping, &x))
6221 goto onError;
6222 Py_XDECREF(x);
6223 if (x!=Py_None)
6224 break;
6225 ++collend;
6226 }
6227 /* cache callback name lookup
6228 * (if not done yet, i.e. it's the first error) */
6229 if (known_errorHandler==-1) {
6230 if ((errors==NULL) || (!strcmp(errors, "strict")))
6231 known_errorHandler = 1;
6232 else if (!strcmp(errors, "replace"))
6233 known_errorHandler = 2;
6234 else if (!strcmp(errors, "ignore"))
6235 known_errorHandler = 3;
6236 else if (!strcmp(errors, "xmlcharrefreplace"))
6237 known_errorHandler = 4;
6238 else
6239 known_errorHandler = 0;
6240 }
6241 switch (known_errorHandler) {
6242 case 1: /* strict */
6243 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006244 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 case 2: /* replace */
6246 /* No need to check for space, this is a 1:1 replacement */
6247 for (coll = collstart; coll<collend; ++coll)
6248 *str++ = '?';
6249 /* fall through */
6250 case 3: /* ignore */
6251 p = collend;
6252 break;
6253 case 4: /* xmlcharrefreplace */
6254 /* generate replacement (temporarily (mis)uses p) */
6255 for (p = collstart; p < collend; ++p) {
6256 char buffer[2+29+1+1];
6257 char *cp;
6258 sprintf(buffer, "&#%d;", (int)*p);
6259 if (charmaptranslate_makespace(&res, &str,
6260 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6261 goto onError;
6262 for (cp = buffer; *cp; ++cp)
6263 *str++ = *cp;
6264 }
6265 p = collend;
6266 break;
6267 default:
6268 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6269 reason, startp, size, &exc,
6270 collstart-startp, collend-startp, &newpos);
6271 if (repunicode == NULL)
6272 goto onError;
6273 /* generate replacement */
6274 repsize = PyUnicode_GET_SIZE(repunicode);
6275 if (charmaptranslate_makespace(&res, &str,
6276 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6277 Py_DECREF(repunicode);
6278 goto onError;
6279 }
6280 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6281 *str++ = *uni2;
6282 p = startp + newpos;
6283 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006285 }
6286 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006287 /* Resize if we allocated to much */
6288 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006289 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 if (PyUnicode_Resize(&res, respos) < 0)
6291 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 }
6293 Py_XDECREF(exc);
6294 Py_XDECREF(errorHandler);
6295 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 Py_XDECREF(res);
6299 Py_XDECREF(exc);
6300 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 return NULL;
6302}
6303
Alexander Belopolsky40018472011-02-26 01:02:56 +00006304PyObject *
6305PyUnicode_Translate(PyObject *str,
6306 PyObject *mapping,
6307 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308{
6309 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006310
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 str = PyUnicode_FromObject(str);
6312 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 PyUnicode_GET_SIZE(str),
6316 mapping,
6317 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 Py_DECREF(str);
6319 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006320
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 Py_XDECREF(str);
6323 return NULL;
6324}
Tim Petersced69f82003-09-16 20:30:58 +00006325
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006326PyObject *
6327PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6328 Py_ssize_t length)
6329{
6330 PyObject *result;
6331 Py_UNICODE *p; /* write pointer into result */
6332 Py_ssize_t i;
6333 /* Copy to a new string */
6334 result = (PyObject *)_PyUnicode_New(length);
6335 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6336 if (result == NULL)
6337 return result;
6338 p = PyUnicode_AS_UNICODE(result);
6339 /* Iterate over code points */
6340 for (i = 0; i < length; i++) {
6341 Py_UNICODE ch =s[i];
6342 if (ch > 127) {
6343 int decimal = Py_UNICODE_TODECIMAL(ch);
6344 if (decimal >= 0)
6345 p[i] = '0' + decimal;
6346 }
6347 }
6348 return result;
6349}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006350/* --- Decimal Encoder ---------------------------------------------------- */
6351
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352int
6353PyUnicode_EncodeDecimal(Py_UNICODE *s,
6354 Py_ssize_t length,
6355 char *output,
6356 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006357{
6358 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359 PyObject *errorHandler = NULL;
6360 PyObject *exc = NULL;
6361 const char *encoding = "decimal";
6362 const char *reason = "invalid decimal Unicode string";
6363 /* the following variable is used for caching string comparisons
6364 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6365 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006366
6367 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 PyErr_BadArgument();
6369 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006370 }
6371
6372 p = s;
6373 end = s + length;
6374 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 register Py_UNICODE ch = *p;
6376 int decimal;
6377 PyObject *repunicode;
6378 Py_ssize_t repsize;
6379 Py_ssize_t newpos;
6380 Py_UNICODE *uni2;
6381 Py_UNICODE *collstart;
6382 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006383
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006385 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 ++p;
6387 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006388 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 decimal = Py_UNICODE_TODECIMAL(ch);
6390 if (decimal >= 0) {
6391 *output++ = '0' + decimal;
6392 ++p;
6393 continue;
6394 }
6395 if (0 < ch && ch < 256) {
6396 *output++ = (char)ch;
6397 ++p;
6398 continue;
6399 }
6400 /* All other characters are considered unencodable */
6401 collstart = p;
6402 collend = p+1;
6403 while (collend < end) {
6404 if ((0 < *collend && *collend < 256) ||
6405 !Py_UNICODE_ISSPACE(*collend) ||
6406 Py_UNICODE_TODECIMAL(*collend))
6407 break;
6408 }
6409 /* cache callback name lookup
6410 * (if not done yet, i.e. it's the first error) */
6411 if (known_errorHandler==-1) {
6412 if ((errors==NULL) || (!strcmp(errors, "strict")))
6413 known_errorHandler = 1;
6414 else if (!strcmp(errors, "replace"))
6415 known_errorHandler = 2;
6416 else if (!strcmp(errors, "ignore"))
6417 known_errorHandler = 3;
6418 else if (!strcmp(errors, "xmlcharrefreplace"))
6419 known_errorHandler = 4;
6420 else
6421 known_errorHandler = 0;
6422 }
6423 switch (known_errorHandler) {
6424 case 1: /* strict */
6425 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6426 goto onError;
6427 case 2: /* replace */
6428 for (p = collstart; p < collend; ++p)
6429 *output++ = '?';
6430 /* fall through */
6431 case 3: /* ignore */
6432 p = collend;
6433 break;
6434 case 4: /* xmlcharrefreplace */
6435 /* generate replacement (temporarily (mis)uses p) */
6436 for (p = collstart; p < collend; ++p)
6437 output += sprintf(output, "&#%d;", (int)*p);
6438 p = collend;
6439 break;
6440 default:
6441 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6442 encoding, reason, s, length, &exc,
6443 collstart-s, collend-s, &newpos);
6444 if (repunicode == NULL)
6445 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006446 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006447 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006448 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6449 Py_DECREF(repunicode);
6450 goto onError;
6451 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 /* generate replacement */
6453 repsize = PyUnicode_GET_SIZE(repunicode);
6454 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6455 Py_UNICODE ch = *uni2;
6456 if (Py_UNICODE_ISSPACE(ch))
6457 *output++ = ' ';
6458 else {
6459 decimal = Py_UNICODE_TODECIMAL(ch);
6460 if (decimal >= 0)
6461 *output++ = '0' + decimal;
6462 else if (0 < ch && ch < 256)
6463 *output++ = (char)ch;
6464 else {
6465 Py_DECREF(repunicode);
6466 raise_encode_exception(&exc, encoding,
6467 s, length, collstart-s, collend-s, reason);
6468 goto onError;
6469 }
6470 }
6471 }
6472 p = s + newpos;
6473 Py_DECREF(repunicode);
6474 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006475 }
6476 /* 0-terminate the output string */
6477 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 Py_XDECREF(exc);
6479 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006480 return 0;
6481
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 Py_XDECREF(exc);
6484 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006485 return -1;
6486}
6487
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488/* --- Helpers ------------------------------------------------------------ */
6489
Eric Smith8c663262007-08-25 02:26:07 +00006490#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006491#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006492
Thomas Wouters477c8d52006-05-27 19:21:47 +00006493#include "stringlib/count.h"
6494#include "stringlib/find.h"
6495#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006496#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006497
Eric Smith5807c412008-05-11 21:00:57 +00006498#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006499#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006500#include "stringlib/localeutil.h"
6501
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len when it exceeds it; a negative end has len added
   and is then clamped to 0.  A negative start has len added and is then
   clamped to 0; start is not compared against len.

   start and end must be modifiable lvalues.  All three arguments are
   evaluated more than once, so they must be free of side effects.  Every
   use of an argument is parenthesized so that expression arguments (for
   example a conditional expression passed as len) keep their meaning.

   NOTE: the expansion is a sequence of plain `if` statements rather than
   a single statement, so the macro must not be used as the unbraced body
   of an if/else or loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516
Alexander Belopolsky40018472011-02-26 01:02:56 +00006517Py_ssize_t
6518PyUnicode_Count(PyObject *str,
6519 PyObject *substr,
6520 Py_ssize_t start,
6521 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006523 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006524 PyUnicodeObject* str_obj;
6525 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006526
Thomas Wouters477c8d52006-05-27 19:21:47 +00006527 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6528 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006530 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6531 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 Py_DECREF(str_obj);
6533 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 }
Tim Petersced69f82003-09-16 20:30:58 +00006535
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006536 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006537 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006538 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6539 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006540 );
6541
6542 Py_DECREF(sub_obj);
6543 Py_DECREF(str_obj);
6544
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 return result;
6546}
6547
Alexander Belopolsky40018472011-02-26 01:02:56 +00006548Py_ssize_t
6549PyUnicode_Find(PyObject *str,
6550 PyObject *sub,
6551 Py_ssize_t start,
6552 Py_ssize_t end,
6553 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006555 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006556
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006558 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006560 sub = PyUnicode_FromObject(sub);
6561 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 Py_DECREF(str);
6563 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 }
Tim Petersced69f82003-09-16 20:30:58 +00006565
Thomas Wouters477c8d52006-05-27 19:21:47 +00006566 if (direction > 0)
6567 result = stringlib_find_slice(
6568 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6569 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6570 start, end
6571 );
6572 else
6573 result = stringlib_rfind_slice(
6574 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6575 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6576 start, end
6577 );
6578
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006580 Py_DECREF(sub);
6581
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 return result;
6583}
6584
Alexander Belopolsky40018472011-02-26 01:02:56 +00006585static int
6586tailmatch(PyUnicodeObject *self,
6587 PyUnicodeObject *substring,
6588 Py_ssize_t start,
6589 Py_ssize_t end,
6590 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 if (substring->length == 0)
6593 return 1;
6594
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006595 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 end -= substring->length;
6597 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599
6600 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 if (Py_UNICODE_MATCH(self, end, substring))
6602 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 } else {
6604 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 }
6607
6608 return 0;
6609}
6610
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611Py_ssize_t
6612PyUnicode_Tailmatch(PyObject *str,
6613 PyObject *substr,
6614 Py_ssize_t start,
6615 Py_ssize_t end,
6616 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006618 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006619
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 str = PyUnicode_FromObject(str);
6621 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 substr = PyUnicode_FromObject(substr);
6624 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 Py_DECREF(str);
6626 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
Tim Petersced69f82003-09-16 20:30:58 +00006628
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 (PyUnicodeObject *)substr,
6631 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 Py_DECREF(str);
6633 Py_DECREF(substr);
6634 return result;
6635}
6636
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637/* Apply fixfct filter to the Unicode object self and return a
6638 reference to the modified object */
6639
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640static PyObject *
6641fixup(PyUnicodeObject *self,
6642 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643{
6644
6645 PyUnicodeObject *u;
6646
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006647 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006650
6651 Py_UNICODE_COPY(u->str, self->str, self->length);
6652
Tim Peters7a29bd52001-09-12 03:03:31 +00006653 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 /* fixfct should return TRUE if it modified the buffer. If
6655 FALSE, return a reference to the original buffer instead
6656 (to save space, not time) */
6657 Py_INCREF(self);
6658 Py_DECREF(u);
6659 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 }
6661 return (PyObject*) u;
6662}
6663
Alexander Belopolsky40018472011-02-26 01:02:56 +00006664static int
6665fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006667 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 Py_UNICODE *s = self->str;
6669 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006673
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 ch = Py_UNICODE_TOUPPER(*s);
6675 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 *s = ch;
6678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 s++;
6680 }
6681
6682 return status;
6683}
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685static int
6686fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006688 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 Py_UNICODE *s = self->str;
6690 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006691
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006694
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 ch = Py_UNICODE_TOLOWER(*s);
6696 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 *s = ch;
6699 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 s++;
6701 }
6702
6703 return status;
6704}
6705
Alexander Belopolsky40018472011-02-26 01:02:56 +00006706static int
6707fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006709 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 Py_UNICODE *s = self->str;
6711 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006712
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 while (len-- > 0) {
6714 if (Py_UNICODE_ISUPPER(*s)) {
6715 *s = Py_UNICODE_TOLOWER(*s);
6716 status = 1;
6717 } else if (Py_UNICODE_ISLOWER(*s)) {
6718 *s = Py_UNICODE_TOUPPER(*s);
6719 status = 1;
6720 }
6721 s++;
6722 }
6723
6724 return status;
6725}
6726
Alexander Belopolsky40018472011-02-26 01:02:56 +00006727static int
6728fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006730 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006731 Py_UNICODE *s = self->str;
6732 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006733
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006734 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006736 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 *s = Py_UNICODE_TOUPPER(*s);
6738 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006740 s++;
6741 while (--len > 0) {
6742 if (Py_UNICODE_ISUPPER(*s)) {
6743 *s = Py_UNICODE_TOLOWER(*s);
6744 status = 1;
6745 }
6746 s++;
6747 }
6748 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749}
6750
Alexander Belopolsky40018472011-02-26 01:02:56 +00006751static int
6752fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753{
6754 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6755 register Py_UNICODE *e;
6756 int previous_is_cased;
6757
6758 /* Shortcut for single character strings */
6759 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6761 if (*p != ch) {
6762 *p = ch;
6763 return 1;
6764 }
6765 else
6766 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 }
Tim Petersced69f82003-09-16 20:30:58 +00006768
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 e = p + PyUnicode_GET_SIZE(self);
6770 previous_is_cased = 0;
6771 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006773
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 if (previous_is_cased)
6775 *p = Py_UNICODE_TOLOWER(ch);
6776 else
6777 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006778
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 if (Py_UNICODE_ISLOWER(ch) ||
6780 Py_UNICODE_ISUPPER(ch) ||
6781 Py_UNICODE_ISTITLE(ch))
6782 previous_is_cased = 1;
6783 else
6784 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 }
6786 return 1;
6787}
6788
Tim Peters8ce9f162004-08-27 01:49:32 +00006789PyObject *
6790PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791{
Skip Montanaro6543b452004-09-16 03:28:13 +00006792 const Py_UNICODE blank = ' ';
6793 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006794 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006795 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006796 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6797 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006798 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6799 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006800 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006801 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
Tim Peters05eba1f2004-08-27 21:32:02 +00006803 fseq = PySequence_Fast(seq, "");
6804 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006805 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006806 }
6807
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006808 /* NOTE: the following code can't call back into Python code,
6809 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006810 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006811
Tim Peters05eba1f2004-08-27 21:32:02 +00006812 seqlen = PySequence_Fast_GET_SIZE(fseq);
6813 /* If empty sequence, return u"". */
6814 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006815 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6816 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006817 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006818 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006819 /* If singleton sequence with an exact Unicode, return that. */
6820 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 item = items[0];
6822 if (PyUnicode_CheckExact(item)) {
6823 Py_INCREF(item);
6824 res = (PyUnicodeObject *)item;
6825 goto Done;
6826 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006827 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006828 else {
6829 /* Set up sep and seplen */
6830 if (separator == NULL) {
6831 sep = &blank;
6832 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006833 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006834 else {
6835 if (!PyUnicode_Check(separator)) {
6836 PyErr_Format(PyExc_TypeError,
6837 "separator: expected str instance,"
6838 " %.80s found",
6839 Py_TYPE(separator)->tp_name);
6840 goto onError;
6841 }
6842 sep = PyUnicode_AS_UNICODE(separator);
6843 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006844 }
6845 }
6846
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006847 /* There are at least two things to join, or else we have a subclass
6848 * of str in the sequence.
6849 * Do a pre-pass to figure out the total amount of space we'll
6850 * need (sz), and see whether all argument are strings.
6851 */
6852 sz = 0;
6853 for (i = 0; i < seqlen; i++) {
6854 const Py_ssize_t old_sz = sz;
6855 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 if (!PyUnicode_Check(item)) {
6857 PyErr_Format(PyExc_TypeError,
6858 "sequence item %zd: expected str instance,"
6859 " %.80s found",
6860 i, Py_TYPE(item)->tp_name);
6861 goto onError;
6862 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006863 sz += PyUnicode_GET_SIZE(item);
6864 if (i != 0)
6865 sz += seplen;
6866 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6867 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006869 goto onError;
6870 }
6871 }
Tim Petersced69f82003-09-16 20:30:58 +00006872
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006873 res = _PyUnicode_New(sz);
6874 if (res == NULL)
6875 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006876
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006877 /* Catenate everything. */
6878 res_p = PyUnicode_AS_UNICODE(res);
6879 for (i = 0; i < seqlen; ++i) {
6880 Py_ssize_t itemlen;
6881 item = items[i];
6882 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 /* Copy item, and maybe the separator. */
6884 if (i) {
6885 Py_UNICODE_COPY(res_p, sep, seplen);
6886 res_p += seplen;
6887 }
6888 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6889 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006890 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006891
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006893 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 return (PyObject *)res;
6895
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006897 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006898 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 return NULL;
6900}
6901
Alexander Belopolsky40018472011-02-26 01:02:56 +00006902static PyUnicodeObject *
6903pad(PyUnicodeObject *self,
6904 Py_ssize_t left,
6905 Py_ssize_t right,
6906 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907{
6908 PyUnicodeObject *u;
6909
6910 if (left < 0)
6911 left = 0;
6912 if (right < 0)
6913 right = 0;
6914
Tim Peters7a29bd52001-09-12 03:03:31 +00006915 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 Py_INCREF(self);
6917 return self;
6918 }
6919
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006920 if (left > PY_SSIZE_T_MAX - self->length ||
6921 right > PY_SSIZE_T_MAX - (left + self->length)) {
6922 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6923 return NULL;
6924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 u = _PyUnicode_New(left + self->length + right);
6926 if (u) {
6927 if (left)
6928 Py_UNICODE_FILL(u->str, fill, left);
6929 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6930 if (right)
6931 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6932 }
6933
6934 return u;
6935}
6936
Alexander Belopolsky40018472011-02-26 01:02:56 +00006937PyObject *
6938PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941
6942 string = PyUnicode_FromObject(string);
6943 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006946 list = stringlib_splitlines(
6947 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6948 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950 Py_DECREF(string);
6951 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952}
6953
Alexander Belopolsky40018472011-02-26 01:02:56 +00006954static PyObject *
6955split(PyUnicodeObject *self,
6956 PyUnicodeObject *substring,
6957 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006960 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006963 return stringlib_split_whitespace(
6964 (PyObject*) self, self->str, self->length, maxcount
6965 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006967 return stringlib_split(
6968 (PyObject*) self, self->str, self->length,
6969 substring->str, substring->length,
6970 maxcount
6971 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972}
6973
Alexander Belopolsky40018472011-02-26 01:02:56 +00006974static PyObject *
6975rsplit(PyUnicodeObject *self,
6976 PyUnicodeObject *substring,
6977 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006978{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006979 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006980 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006981
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006982 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006983 return stringlib_rsplit_whitespace(
6984 (PyObject*) self, self->str, self->length, maxcount
6985 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006986
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006987 return stringlib_rsplit(
6988 (PyObject*) self, self->str, self->length,
6989 substring->str, substring->length,
6990 maxcount
6991 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006992}
6993
Alexander Belopolsky40018472011-02-26 01:02:56 +00006994static PyObject *
6995replace(PyUnicodeObject *self,
6996 PyUnicodeObject *str1,
6997 PyUnicodeObject *str2,
6998 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999{
7000 PyUnicodeObject *u;
7001
7002 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007004 else if (maxcount == 0 || self->length == 0)
7005 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
Thomas Wouters477c8d52006-05-27 19:21:47 +00007007 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00007008 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007010 if (str1->length == 0)
7011 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007012 if (str1->length == 1) {
7013 /* replace characters */
7014 Py_UNICODE u1, u2;
7015 if (!findchar(self->str, self->length, str1->str[0]))
7016 goto nothing;
7017 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7018 if (!u)
7019 return NULL;
7020 Py_UNICODE_COPY(u->str, self->str, self->length);
7021 u1 = str1->str[0];
7022 u2 = str2->str[0];
7023 for (i = 0; i < u->length; i++)
7024 if (u->str[i] == u1) {
7025 if (--maxcount < 0)
7026 break;
7027 u->str[i] = u2;
7028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007030 i = stringlib_find(
7031 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007033 if (i < 0)
7034 goto nothing;
7035 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7036 if (!u)
7037 return NULL;
7038 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007039
7040 /* change everything in-place, starting with this one */
7041 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7042 i += str1->length;
7043
7044 while ( --maxcount > 0) {
7045 i = stringlib_find(self->str+i, self->length-i,
7046 str1->str, str1->length,
7047 i);
7048 if (i == -1)
7049 break;
7050 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7051 i += str1->length;
7052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007055
Brett Cannonb94767f2011-02-22 20:15:44 +00007056 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007057 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 Py_UNICODE *p;
7059
7060 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007061 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7062 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007063 if (n == 0)
7064 goto nothing;
7065 /* new_size = self->length + n * (str2->length - str1->length)); */
7066 delta = (str2->length - str1->length);
7067 if (delta == 0) {
7068 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007070 product = n * (str2->length - str1->length);
7071 if ((product / (str2->length - str1->length)) != n) {
7072 PyErr_SetString(PyExc_OverflowError,
7073 "replace string is too long");
7074 return NULL;
7075 }
7076 new_size = self->length + product;
7077 if (new_size < 0) {
7078 PyErr_SetString(PyExc_OverflowError,
7079 "replace string is too long");
7080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 }
7082 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007083 u = _PyUnicode_New(new_size);
7084 if (!u)
7085 return NULL;
7086 i = 0;
7087 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007088 if (str1->length > 0) {
7089 while (n-- > 0) {
7090 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007091 j = stringlib_find(self->str+i, self->length-i,
7092 str1->str, str1->length,
7093 i);
7094 if (j == -1)
7095 break;
7096 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007097 /* copy unchanged part [i:j] */
7098 Py_UNICODE_COPY(p, self->str+i, j-i);
7099 p += j - i;
7100 }
7101 /* copy substitution string */
7102 if (str2->length > 0) {
7103 Py_UNICODE_COPY(p, str2->str, str2->length);
7104 p += str2->length;
7105 }
7106 i = j + str1->length;
7107 }
7108 if (i < self->length)
7109 /* copy tail [i:] */
7110 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7111 } else {
7112 /* interleave */
7113 while (n > 0) {
7114 Py_UNICODE_COPY(p, str2->str, str2->length);
7115 p += str2->length;
7116 if (--n <= 0)
7117 break;
7118 *p++ = self->str[i++];
7119 }
7120 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007124
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007126 /* nothing to replace; return original string (when possible) */
7127 if (PyUnicode_CheckExact(self)) {
7128 Py_INCREF(self);
7129 return (PyObject *) self;
7130 }
7131 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132}
7133
7134/* --- Unicode Object Methods --------------------------------------------- */
7135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007136PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138\n\
7139Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007140characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141
7142static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007143unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 return fixup(self, fixtitle);
7146}
7147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007148PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150\n\
7151Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007152have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153
7154static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007155unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 return fixup(self, fixcapitalize);
7158}
7159
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code (compiled out by the surrounding #if 0).
   Splits self on whitespace, runs fixup(..., fixcapitalize) on every
   word and joins the words again with the default separator.
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL)
            goto finished;      /* result is still NULL */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  finished:
    Py_DECREF(words);
    return result;
}
#endif
7197
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007198/* Argument converter. Coerces to a single unicode character */
7199
7200static int
7201convert_uc(PyObject *obj, void *addr)
7202{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007203 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7204 PyObject *uniobj;
7205 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007206
Benjamin Peterson14339b62009-01-31 16:36:08 +00007207 uniobj = PyUnicode_FromObject(obj);
7208 if (uniobj == NULL) {
7209 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007211 return 0;
7212 }
7213 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7214 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007216 Py_DECREF(uniobj);
7217 return 0;
7218 }
7219 unistr = PyUnicode_AS_UNICODE(uniobj);
7220 *fillcharloc = unistr[0];
7221 Py_DECREF(uniobj);
7222 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007223}
7224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007225PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007228Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007229done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230
7231static PyObject *
7232unicode_center(PyUnicodeObject *self, PyObject *args)
7233{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007234 Py_ssize_t marg, left;
7235 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007236 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237
Thomas Woutersde017742006-02-16 19:34:37 +00007238 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 return NULL;
7240
Tim Peters7a29bd52001-09-12 03:03:31 +00007241 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 Py_INCREF(self);
7243 return (PyObject*) self;
7244 }
7245
7246 marg = width - self->length;
7247 left = marg / 2 + (marg & width & 1);
7248
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007249 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250}
7251
Marc-André Lemburge5034372000-08-08 08:04:29 +00007252#if 0
7253
7254/* This code should go into some future Unicode collation support
7255 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007256 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007257
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007258/* speedy UTF-16 code point order comparison */
7259/* gleaned from: */
7260/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7261
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007262static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007263{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007264 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007265 0, 0, 0, 0, 0, 0, 0, 0,
7266 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007267 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007268};
7269
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270static int
7271unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7272{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007273 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007274
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 Py_UNICODE *s1 = str1->str;
7276 Py_UNICODE *s2 = str2->str;
7277
7278 len1 = str1->length;
7279 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007280
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007282 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007283
7284 c1 = *s1++;
7285 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007286
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 if (c1 > (1<<11) * 26)
7288 c1 += utf16Fixup[c1>>11];
7289 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007290 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007291 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007292
7293 if (c1 != c2)
7294 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007295
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007296 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297 }
7298
7299 return (len1 < len2) ? -1 : (len1 != len2);
7300}
7301
Marc-André Lemburge5034372000-08-08 08:04:29 +00007302#else
7303
7304static int
7305unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7306{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007307 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007308
7309 Py_UNICODE *s1 = str1->str;
7310 Py_UNICODE *s2 = str2->str;
7311
7312 len1 = str1->length;
7313 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007314
Marc-André Lemburge5034372000-08-08 08:04:29 +00007315 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007316 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007317
Fredrik Lundh45714e92001-06-26 16:39:36 +00007318 c1 = *s1++;
7319 c2 = *s2++;
7320
7321 if (c1 != c2)
7322 return (c1 < c2) ? -1 : 1;
7323
Marc-André Lemburge5034372000-08-08 08:04:29 +00007324 len1--; len2--;
7325 }
7326
7327 return (len1 < len2) ? -1 : (len1 != len2);
7328}
7329
7330#endif
7331
Alexander Belopolsky40018472011-02-26 01:02:56 +00007332int
7333PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007335 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7336 return unicode_compare((PyUnicodeObject *)left,
7337 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007338 PyErr_Format(PyExc_TypeError,
7339 "Can't compare %.100s and %.100s",
7340 left->ob_type->tp_name,
7341 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 return -1;
7343}
7344
Martin v. Löwis5b222132007-06-10 09:51:05 +00007345int
7346PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7347{
7348 int i;
7349 Py_UNICODE *id;
7350 assert(PyUnicode_Check(uni));
7351 id = PyUnicode_AS_UNICODE(uni);
7352 /* Compare Unicode string and source character set string */
7353 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007354 if (id[i] != str[i])
7355 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007356 /* This check keeps Python strings that end in '\0' from comparing equal
7357 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007358 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007360 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007362 return 0;
7363}
7364
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007365
Benjamin Peterson29060642009-01-31 22:14:21 +00007366#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007367 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007368
Alexander Belopolsky40018472011-02-26 01:02:56 +00007369PyObject *
7370PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007371{
7372 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007373
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007374 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7375 PyObject *v;
Benjamin Peterson5fd4bd32011-03-06 09:06:34 -06007376 if (PyUnicode_GET_SIZE(left) != PyUnicode_GET_SIZE(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007377 if (op == Py_EQ) {
7378 Py_INCREF(Py_False);
7379 return Py_False;
7380 }
7381 if (op == Py_NE) {
7382 Py_INCREF(Py_True);
7383 return Py_True;
7384 }
7385 }
7386 if (left == right)
7387 result = 0;
7388 else
7389 result = unicode_compare((PyUnicodeObject *)left,
7390 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007391
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007392 /* Convert the return value to a Boolean */
7393 switch (op) {
7394 case Py_EQ:
7395 v = TEST_COND(result == 0);
7396 break;
7397 case Py_NE:
7398 v = TEST_COND(result != 0);
7399 break;
7400 case Py_LE:
7401 v = TEST_COND(result <= 0);
7402 break;
7403 case Py_GE:
7404 v = TEST_COND(result >= 0);
7405 break;
7406 case Py_LT:
7407 v = TEST_COND(result == -1);
7408 break;
7409 case Py_GT:
7410 v = TEST_COND(result == 1);
7411 break;
7412 default:
7413 PyErr_BadArgument();
7414 return NULL;
7415 }
7416 Py_INCREF(v);
7417 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007418 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007419
Brian Curtindfc80e32011-08-10 20:28:54 -05007420 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007421}
7422
Alexander Belopolsky40018472011-02-26 01:02:56 +00007423int
7424PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007425{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007426 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007427 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007428
7429 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007430 sub = PyUnicode_FromObject(element);
7431 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 PyErr_Format(PyExc_TypeError,
7433 "'in <string>' requires string as left operand, not %s",
7434 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007435 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007436 }
7437
Thomas Wouters477c8d52006-05-27 19:21:47 +00007438 str = PyUnicode_FromObject(container);
7439 if (!str) {
7440 Py_DECREF(sub);
7441 return -1;
7442 }
7443
7444 result = stringlib_contains_obj(str, sub);
7445
7446 Py_DECREF(str);
7447 Py_DECREF(sub);
7448
Guido van Rossum403d68b2000-03-13 15:55:09 +00007449 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007450}
7451
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452/* Concat to string or Unicode object giving a new Unicode object. */
7453
Alexander Belopolsky40018472011-02-26 01:02:56 +00007454PyObject *
7455PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007456{
7457 PyUnicodeObject *u = NULL, *v = NULL, *w;
7458
7459 /* Coerce the two arguments */
7460 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7461 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7464 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466
7467 /* Shortcuts */
7468 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 Py_DECREF(v);
7470 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 }
7472 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007473 Py_DECREF(u);
7474 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 }
7476
7477 /* Concat the two Unicode strings */
7478 w = _PyUnicode_New(u->length + v->length);
7479 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007481 Py_UNICODE_COPY(w->str, u->str, u->length);
7482 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7483
7484 Py_DECREF(u);
7485 Py_DECREF(v);
7486 return (PyObject *)w;
7487
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007489 Py_XDECREF(u);
7490 Py_XDECREF(v);
7491 return NULL;
7492}
7493
Walter Dörwald1ab83302007-05-18 17:15:44 +00007494void
7495PyUnicode_Append(PyObject **pleft, PyObject *right)
7496{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007497 PyObject *new;
7498 if (*pleft == NULL)
7499 return;
7500 if (right == NULL || !PyUnicode_Check(*pleft)) {
7501 Py_DECREF(*pleft);
7502 *pleft = NULL;
7503 return;
7504 }
7505 new = PyUnicode_Concat(*pleft, right);
7506 Py_DECREF(*pleft);
7507 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007508}
7509
7510void
7511PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7512{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007513 PyUnicode_Append(pleft, right);
7514 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007515}
7516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007517PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007520Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007521string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007522interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007523
7524static PyObject *
7525unicode_count(PyUnicodeObject *self, PyObject *args)
7526{
7527 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007528 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007529 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007530 PyObject *result;
7531
Jesus Ceaac451502011-04-20 17:09:23 +02007532 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7533 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007535
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007536 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007537 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007538 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007539 substring->str, substring->length,
7540 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007541 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542
7543 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007544
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545 return result;
7546}
7547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007548PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007549 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007551Encode S using the codec registered for encoding. Default encoding\n\
7552is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007553handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007554a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7555'xmlcharrefreplace' as well as any other name registered with\n\
7556codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007557
7558static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007559unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007561 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 char *encoding = NULL;
7563 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007564
Benjamin Peterson308d6372009-09-18 21:42:35 +00007565 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7566 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007568 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007569}
7570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007571PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573\n\
7574Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007575If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576
7577static PyObject*
7578unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7579{
7580 Py_UNICODE *e;
7581 Py_UNICODE *p;
7582 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007583 Py_UNICODE *qe;
7584 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007585 PyUnicodeObject *u;
7586 int tabsize = 8;
7587
7588 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590
Thomas Wouters7e474022000-07-16 12:04:32 +00007591 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007592 i = 0; /* chars up to and including most recent \n or \r */
7593 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7594 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 for (p = self->str; p < e; p++)
7596 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 if (tabsize > 0) {
7598 incr = tabsize - (j % tabsize); /* cannot overflow */
7599 if (j > PY_SSIZE_T_MAX - incr)
7600 goto overflow1;
7601 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007602 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 if (j > PY_SSIZE_T_MAX - 1)
7606 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 j++;
7608 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 if (i > PY_SSIZE_T_MAX - j)
7610 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007612 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 }
7614 }
7615
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007616 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007618
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619 /* Second pass: create output string and fill it */
7620 u = _PyUnicode_New(i + j);
7621 if (!u)
7622 return NULL;
7623
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007624 j = 0; /* same as in first pass */
7625 q = u->str; /* next output char */
7626 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
7628 for (p = self->str; p < e; p++)
7629 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 if (tabsize > 0) {
7631 i = tabsize - (j % tabsize);
7632 j += i;
7633 while (i--) {
7634 if (q >= qe)
7635 goto overflow2;
7636 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007637 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007639 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 else {
7641 if (q >= qe)
7642 goto overflow2;
7643 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007644 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645 if (*p == '\n' || *p == '\r')
7646 j = 0;
7647 }
7648
7649 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007650
7651 overflow2:
7652 Py_DECREF(u);
7653 overflow1:
7654 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656}
7657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007658PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660\n\
7661Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08007662such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663arguments start and end are interpreted as in slice notation.\n\
7664\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007665Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666
7667static PyObject *
7668unicode_find(PyUnicodeObject *self, PyObject *args)
7669{
Jesus Ceaac451502011-04-20 17:09:23 +02007670 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007671 Py_ssize_t start;
7672 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007673 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674
Jesus Ceaac451502011-04-20 17:09:23 +02007675 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7676 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678
Thomas Wouters477c8d52006-05-27 19:21:47 +00007679 result = stringlib_find_slice(
7680 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7681 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7682 start, end
7683 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684
7685 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007686
Christian Heimes217cfd12007-12-02 14:31:20 +00007687 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688}
7689
7690static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007691unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007692{
7693 if (index < 0 || index >= self->length) {
7694 PyErr_SetString(PyExc_IndexError, "string index out of range");
7695 return NULL;
7696 }
7697
7698 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7699}
7700
Guido van Rossumc2504932007-09-18 19:42:40 +00007701/* Believe it or not, this produces the same value for ASCII strings
7702 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007703static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007704unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705{
Guido van Rossumc2504932007-09-18 19:42:40 +00007706 Py_ssize_t len;
7707 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007708 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007709
7710 if (self->hash != -1)
7711 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007712 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007713 p = self->str;
7714 x = *p << 7;
7715 while (--len >= 0)
7716 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007717 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007718 if (x == -1)
7719 x = -2;
7720 self->hash = x;
7721 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722}
7723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007724PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007727Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728
7729static PyObject *
7730unicode_index(PyUnicodeObject *self, PyObject *args)
7731{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007732 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007733 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007734 Py_ssize_t start;
7735 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
Jesus Ceaac451502011-04-20 17:09:23 +02007737 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7738 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740
Thomas Wouters477c8d52006-05-27 19:21:47 +00007741 result = stringlib_find_slice(
7742 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7743 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7744 start, end
7745 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746
7747 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007748
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 if (result < 0) {
7750 PyErr_SetString(PyExc_ValueError, "substring not found");
7751 return NULL;
7752 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007753
Christian Heimes217cfd12007-12-02 14:31:20 +00007754 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755}
7756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007757PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007758 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007760Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007761at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762
7763static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007764unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007765{
7766 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7767 register const Py_UNICODE *e;
7768 int cased;
7769
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770 /* Shortcut for single character strings */
7771 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007774 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007775 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007777
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778 e = p + PyUnicode_GET_SIZE(self);
7779 cased = 0;
7780 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007782
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7784 return PyBool_FromLong(0);
7785 else if (!cased && Py_UNICODE_ISLOWER(ch))
7786 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007787 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007788 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789}
7790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007791PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007794Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007795at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796
7797static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007798unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007799{
7800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7801 register const Py_UNICODE *e;
7802 int cased;
7803
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804 /* Shortcut for single character strings */
7805 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007808 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007809 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007811
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 e = p + PyUnicode_GET_SIZE(self);
7813 cased = 0;
7814 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007816
Benjamin Peterson29060642009-01-31 22:14:21 +00007817 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7818 return PyBool_FromLong(0);
7819 else if (!cased && Py_UNICODE_ISUPPER(ch))
7820 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007822 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007823}
7824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007825PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007827\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007828Return True if S is a titlecased string and there is at least one\n\
7829character in S, i.e. upper- and titlecase characters may only\n\
7830follow uncased characters and lowercase characters only cased ones.\n\
7831Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832
7833static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007834unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835{
7836 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7837 register const Py_UNICODE *e;
7838 int cased, previous_is_cased;
7839
Guido van Rossumd57fd912000-03-10 22:53:23 +00007840 /* Shortcut for single character strings */
7841 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7843 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007845 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007846 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007848
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849 e = p + PyUnicode_GET_SIZE(self);
7850 cased = 0;
7851 previous_is_cased = 0;
7852 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007854
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7856 if (previous_is_cased)
7857 return PyBool_FromLong(0);
7858 previous_is_cased = 1;
7859 cased = 1;
7860 }
7861 else if (Py_UNICODE_ISLOWER(ch)) {
7862 if (!previous_is_cased)
7863 return PyBool_FromLong(0);
7864 previous_is_cased = 1;
7865 cased = 1;
7866 }
7867 else
7868 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007870 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871}
7872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007873PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007876Return True if all characters in S are whitespace\n\
7877and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878
7879static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007880unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881{
7882 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7883 register const Py_UNICODE *e;
7884
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885 /* Shortcut for single character strings */
7886 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007887 Py_UNICODE_ISSPACE(*p))
7888 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007890 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007891 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007893
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894 e = p + PyUnicode_GET_SIZE(self);
7895 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 if (!Py_UNICODE_ISSPACE(*p))
7897 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007899 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007900}
7901
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007902PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007904\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007905Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007906and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007907
7908static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007909unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007910{
7911 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7912 register const Py_UNICODE *e;
7913
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007914 /* Shortcut for single character strings */
7915 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 Py_UNICODE_ISALPHA(*p))
7917 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007918
7919 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007920 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007922
7923 e = p + PyUnicode_GET_SIZE(self);
7924 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 if (!Py_UNICODE_ISALPHA(*p))
7926 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007927 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007928 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007929}
7930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007931PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007933\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007934Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007935and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007936
7937static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007938unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007939{
7940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7941 register const Py_UNICODE *e;
7942
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007943 /* Shortcut for single character strings */
7944 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 Py_UNICODE_ISALNUM(*p))
7946 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007947
7948 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007949 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007951
7952 e = p + PyUnicode_GET_SIZE(self);
7953 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 if (!Py_UNICODE_ISALNUM(*p))
7955 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007957 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007958}
7959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007960PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007963Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007964False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965
7966static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007967unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968{
7969 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7970 register const Py_UNICODE *e;
7971
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 /* Shortcut for single character strings */
7973 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 Py_UNICODE_ISDECIMAL(*p))
7975 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007977 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007978 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007980
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 e = p + PyUnicode_GET_SIZE(self);
7982 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 if (!Py_UNICODE_ISDECIMAL(*p))
7984 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007986 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987}
7988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007989PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007992Return True if all characters in S are digits\n\
7993and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994
7995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007996unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997{
7998 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7999 register const Py_UNICODE *e;
8000
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 /* Shortcut for single character strings */
8002 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 Py_UNICODE_ISDIGIT(*p))
8004 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008006 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008007 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008009
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 e = p + PyUnicode_GET_SIZE(self);
8011 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 if (!Py_UNICODE_ISDIGIT(*p))
8013 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008015 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016}
8017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008018PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008021Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008022False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023
8024static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008025unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026{
8027 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8028 register const Py_UNICODE *e;
8029
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030 /* Shortcut for single character strings */
8031 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 Py_UNICODE_ISNUMERIC(*p))
8033 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008035 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008036 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008038
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039 e = p + PyUnicode_GET_SIZE(self);
8040 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 if (!Py_UNICODE_ISNUMERIC(*p))
8042 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008044 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008045}
8046
Martin v. Löwis47383402007-08-15 07:32:56 +00008047int
8048PyUnicode_IsIdentifier(PyObject *self)
8049{
8050 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8051 register const Py_UNICODE *e;
8052
8053 /* Special case for empty strings */
8054 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008056
8057 /* PEP 3131 says that the first character must be in
8058 XID_Start and subsequent characters in XID_Continue,
8059 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008061 letters, digits, underscore). However, given the current
8062 definition of XID_Start and XID_Continue, it is sufficient
8063 to check just for these, except that _ must be allowed
8064 as starting an identifier. */
8065 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8066 return 0;
8067
8068 e = p + PyUnicode_GET_SIZE(self);
8069 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 if (!_PyUnicode_IsXidContinue(*p))
8071 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008072 }
8073 return 1;
8074}
8075
8076PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008078\n\
8079Return True if S is a valid identifier according\n\
8080to the language definition.");
8081
8082static PyObject*
8083unicode_isidentifier(PyObject *self)
8084{
8085 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8086}
8087
Georg Brandl559e5d72008-06-11 18:37:52 +00008088PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008090\n\
8091Return True if all characters in S are considered\n\
8092printable in repr() or S is empty, False otherwise.");
8093
8094static PyObject*
8095unicode_isprintable(PyObject *self)
8096{
8097 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8098 register const Py_UNICODE *e;
8099
8100 /* Shortcut for single character strings */
8101 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8102 Py_RETURN_TRUE;
8103 }
8104
8105 e = p + PyUnicode_GET_SIZE(self);
8106 for (; p < e; p++) {
8107 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8108 Py_RETURN_FALSE;
8109 }
8110 }
8111 Py_RETURN_TRUE;
8112}
8113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008114PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008115 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116\n\
8117Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008118iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119
8120static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008121unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008123 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124}
8125
Martin v. Löwis18e16552006-02-15 17:27:45 +00008126static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127unicode_length(PyUnicodeObject *self)
8128{
8129 return self->length;
8130}
8131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008132PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008135Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008136done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137
8138static PyObject *
8139unicode_ljust(PyUnicodeObject *self, PyObject *args)
8140{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008141 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008142 Py_UNICODE fillchar = ' ';
8143
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008144 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145 return NULL;
8146
Tim Peters7a29bd52001-09-12 03:03:31 +00008147 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 Py_INCREF(self);
8149 return (PyObject*) self;
8150 }
8151
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008152 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153}
8154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008155PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008158Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159
8160static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008161unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163 return fixup(self, fixlower);
8164}
8165
/* Strip types: which end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip types above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip type i: the format string past its 3-char "|O:"
   prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
8174
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008175/* externally visible for str.strip(unicode) */
8176PyObject *
8177_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8178{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8180 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8181 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8182 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8183 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008184
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008186
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 i = 0;
8188 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8190 i++;
8191 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008193
Benjamin Peterson14339b62009-01-31 16:36:08 +00008194 j = len;
8195 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 do {
8197 j--;
8198 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8199 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008200 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008201
Benjamin Peterson14339b62009-01-31 16:36:08 +00008202 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 Py_INCREF(self);
8204 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008205 }
8206 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008207 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008208}
8209
Guido van Rossumd57fd912000-03-10 22:53:23 +00008210
8211static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008212do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8215 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008216
Benjamin Peterson14339b62009-01-31 16:36:08 +00008217 i = 0;
8218 if (striptype != RIGHTSTRIP) {
8219 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8220 i++;
8221 }
8222 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008223
Benjamin Peterson14339b62009-01-31 16:36:08 +00008224 j = len;
8225 if (striptype != LEFTSTRIP) {
8226 do {
8227 j--;
8228 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8229 j++;
8230 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008231
Benjamin Peterson14339b62009-01-31 16:36:08 +00008232 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8233 Py_INCREF(self);
8234 return (PyObject*)self;
8235 }
8236 else
8237 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238}
8239
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008240
8241static PyObject *
8242do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8243{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008244 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008245
Benjamin Peterson14339b62009-01-31 16:36:08 +00008246 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8247 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008248
Benjamin Peterson14339b62009-01-31 16:36:08 +00008249 if (sep != NULL && sep != Py_None) {
8250 if (PyUnicode_Check(sep))
8251 return _PyUnicode_XStrip(self, striptype, sep);
8252 else {
8253 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 "%s arg must be None or str",
8255 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008256 return NULL;
8257 }
8258 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008259
Benjamin Peterson14339b62009-01-31 16:36:08 +00008260 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008261}
8262
8263
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008264PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008266\n\
8267Return a copy of the string S with leading and trailing\n\
8268whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008269If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008270
8271static PyObject *
8272unicode_strip(PyUnicodeObject *self, PyObject *args)
8273{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008274 if (PyTuple_GET_SIZE(args) == 0)
8275 return do_strip(self, BOTHSTRIP); /* Common case */
8276 else
8277 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008278}
8279
8280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008281PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008283\n\
8284Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008285If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008286
8287static PyObject *
8288unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8289{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008290 if (PyTuple_GET_SIZE(args) == 0)
8291 return do_strip(self, LEFTSTRIP); /* Common case */
8292 else
8293 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008294}
8295
8296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008297PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008299\n\
8300Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008301If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008302
8303static PyObject *
8304unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8305{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008306 if (PyTuple_GET_SIZE(args) == 0)
8307 return do_strip(self, RIGHTSTRIP); /* Common case */
8308 else
8309 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008310}
8311
8312
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008314unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315{
8316 PyUnicodeObject *u;
8317 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008318 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008319 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320
Georg Brandl222de0f2009-04-12 12:01:50 +00008321 if (len < 1) {
8322 Py_INCREF(unicode_empty);
8323 return (PyObject *)unicode_empty;
8324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325
Tim Peters7a29bd52001-09-12 03:03:31 +00008326 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327 /* no repeat, return original string */
8328 Py_INCREF(str);
8329 return (PyObject*) str;
8330 }
Tim Peters8f422462000-09-09 06:13:41 +00008331
8332 /* ensure # of chars needed doesn't overflow int and # of bytes
8333 * needed doesn't overflow size_t
8334 */
8335 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008336 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008337 PyErr_SetString(PyExc_OverflowError,
8338 "repeated string is too long");
8339 return NULL;
8340 }
8341 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8342 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8343 PyErr_SetString(PyExc_OverflowError,
8344 "repeated string is too long");
8345 return NULL;
8346 }
8347 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 if (!u)
8349 return NULL;
8350
8351 p = u->str;
8352
Georg Brandl222de0f2009-04-12 12:01:50 +00008353 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008354 Py_UNICODE_FILL(p, str->str[0], len);
8355 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008356 Py_ssize_t done = str->length; /* number of characters copied this far */
8357 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008359 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008360 Py_UNICODE_COPY(p+done, p, n);
8361 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 }
8364
8365 return (PyObject*) u;
8366}
8367
Alexander Belopolsky40018472011-02-26 01:02:56 +00008368PyObject *
8369PyUnicode_Replace(PyObject *obj,
8370 PyObject *subobj,
8371 PyObject *replobj,
8372 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373{
8374 PyObject *self;
8375 PyObject *str1;
8376 PyObject *str2;
8377 PyObject *result;
8378
8379 self = PyUnicode_FromObject(obj);
8380 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 str1 = PyUnicode_FromObject(subobj);
8383 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 Py_DECREF(self);
8385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 }
8387 str2 = PyUnicode_FromObject(replobj);
8388 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 Py_DECREF(self);
8390 Py_DECREF(str1);
8391 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392 }
Tim Petersced69f82003-09-16 20:30:58 +00008393 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 (PyUnicodeObject *)str1,
8395 (PyUnicodeObject *)str2,
8396 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 Py_DECREF(self);
8398 Py_DECREF(str1);
8399 Py_DECREF(str2);
8400 return result;
8401}
8402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008403PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008404 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405\n\
8406Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008407old replaced by new. If the optional argument count is\n\
8408given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409
8410static PyObject*
8411unicode_replace(PyUnicodeObject *self, PyObject *args)
8412{
8413 PyUnicodeObject *str1;
8414 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008415 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416 PyObject *result;
8417
Martin v. Löwis18e16552006-02-15 17:27:45 +00008418 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419 return NULL;
8420 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8421 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008424 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 Py_DECREF(str1);
8426 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428
8429 result = replace(self, str1, str2, maxcount);
8430
8431 Py_DECREF(str1);
8432 Py_DECREF(str2);
8433 return result;
8434}
8435
Alexander Belopolsky40018472011-02-26 01:02:56 +00008436static PyObject *
8437unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008438{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008439 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008440 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008441 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8442 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8443
8444 /* XXX(nnorwitz): rather than over-allocating, it would be
8445 better to choose a different scheme. Perhaps scan the
8446 first N-chars of the string and allocate based on that size.
8447 */
8448 /* Initial allocation is based on the longest-possible unichr
8449 escape.
8450
8451 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8452 unichr, so in this case it's the longest unichr escape. In
8453 narrow (UTF-16) builds this is five chars per source unichr
8454 since there are two unichrs in the surrogate pair, so in narrow
8455 (UTF-16) builds it's not the longest unichr escape.
8456
8457 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8458 so in the narrow (UTF-16) build case it's the longest unichr
8459 escape.
8460 */
8461
Walter Dörwald1ab83302007-05-18 17:15:44 +00008462 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008464#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008466#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008468#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008470 if (repr == NULL)
8471 return NULL;
8472
Walter Dörwald1ab83302007-05-18 17:15:44 +00008473 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008474
8475 /* Add quote */
8476 *p++ = (findchar(s, size, '\'') &&
8477 !findchar(s, size, '"')) ? '"' : '\'';
8478 while (size-- > 0) {
8479 Py_UNICODE ch = *s++;
8480
8481 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008482 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008483 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008484 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008485 continue;
8486 }
8487
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008489 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008490 *p++ = '\\';
8491 *p++ = 't';
8492 }
8493 else if (ch == '\n') {
8494 *p++ = '\\';
8495 *p++ = 'n';
8496 }
8497 else if (ch == '\r') {
8498 *p++ = '\\';
8499 *p++ = 'r';
8500 }
8501
8502 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008503 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008504 *p++ = '\\';
8505 *p++ = 'x';
8506 *p++ = hexdigits[(ch >> 4) & 0x000F];
8507 *p++ = hexdigits[ch & 0x000F];
8508 }
8509
Georg Brandl559e5d72008-06-11 18:37:52 +00008510 /* Copy ASCII characters as-is */
8511 else if (ch < 0x7F) {
8512 *p++ = ch;
8513 }
8514
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008516 else {
8517 Py_UCS4 ucs = ch;
8518
8519#ifndef Py_UNICODE_WIDE
8520 Py_UNICODE ch2 = 0;
8521 /* Get code point from surrogate pair */
8522 if (size > 0) {
8523 ch2 = *s;
8524 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008526 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008528 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008529 size--;
8530 }
8531 }
8532#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008533 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008534 (categories Z* and C* except ASCII space)
8535 */
8536 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8537 /* Map 8-bit characters to '\xhh' */
8538 if (ucs <= 0xff) {
8539 *p++ = '\\';
8540 *p++ = 'x';
8541 *p++ = hexdigits[(ch >> 4) & 0x000F];
8542 *p++ = hexdigits[ch & 0x000F];
8543 }
8544 /* Map 21-bit characters to '\U00xxxxxx' */
8545 else if (ucs >= 0x10000) {
8546 *p++ = '\\';
8547 *p++ = 'U';
8548 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8549 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8550 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8551 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8552 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8553 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8554 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8555 *p++ = hexdigits[ucs & 0x0000000F];
8556 }
8557 /* Map 16-bit characters to '\uxxxx' */
8558 else {
8559 *p++ = '\\';
8560 *p++ = 'u';
8561 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8562 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8563 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8564 *p++ = hexdigits[ucs & 0x000F];
8565 }
8566 }
8567 /* Copy characters as-is */
8568 else {
8569 *p++ = ch;
8570#ifndef Py_UNICODE_WIDE
8571 if (ucs >= 0x10000)
8572 *p++ = ch2;
8573#endif
8574 }
8575 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008576 }
8577 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008578 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008579
8580 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008581 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008582 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583}
8584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008585PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587\n\
8588Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +08008589such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590arguments start and end are interpreted as in slice notation.\n\
8591\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008592Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593
8594static PyObject *
8595unicode_rfind(PyUnicodeObject *self, PyObject *args)
8596{
Jesus Ceaac451502011-04-20 17:09:23 +02008597 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008598 Py_ssize_t start;
8599 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008600 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601
Jesus Ceaac451502011-04-20 17:09:23 +02008602 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8603 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605
Thomas Wouters477c8d52006-05-27 19:21:47 +00008606 result = stringlib_rfind_slice(
8607 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8608 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8609 start, end
8610 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
8612 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008613
Christian Heimes217cfd12007-12-02 14:31:20 +00008614 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615}
8616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008617PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008620Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621
8622static PyObject *
8623unicode_rindex(PyUnicodeObject *self, PyObject *args)
8624{
Jesus Ceaac451502011-04-20 17:09:23 +02008625 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008626 Py_ssize_t start;
8627 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008628 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629
Jesus Ceaac451502011-04-20 17:09:23 +02008630 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8631 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633
Thomas Wouters477c8d52006-05-27 19:21:47 +00008634 result = stringlib_rfind_slice(
8635 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8636 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8637 start, end
8638 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639
8640 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008641
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 if (result < 0) {
8643 PyErr_SetString(PyExc_ValueError, "substring not found");
8644 return NULL;
8645 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008646 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647}
8648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008649PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008652Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008653done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654
8655static PyObject *
8656unicode_rjust(PyUnicodeObject *self, PyObject *args)
8657{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008658 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008659 Py_UNICODE fillchar = ' ';
8660
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008661 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 return NULL;
8663
Tim Peters7a29bd52001-09-12 03:03:31 +00008664 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 Py_INCREF(self);
8666 return (PyObject*) self;
8667 }
8668
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008669 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670}
8671
Alexander Belopolsky40018472011-02-26 01:02:56 +00008672PyObject *
8673PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674{
8675 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008676
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 s = PyUnicode_FromObject(s);
8678 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008679 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 if (sep != NULL) {
8681 sep = PyUnicode_FromObject(sep);
8682 if (sep == NULL) {
8683 Py_DECREF(s);
8684 return NULL;
8685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686 }
8687
8688 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8689
8690 Py_DECREF(s);
8691 Py_XDECREF(sep);
8692 return result;
8693}
8694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008695PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008696 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697\n\
8698Return a list of the words in S, using sep as the\n\
8699delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008700splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008701whitespace string is a separator and empty strings are\n\
8702removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703
8704static PyObject*
8705unicode_split(PyUnicodeObject *self, PyObject *args)
8706{
8707 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008708 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709
Martin v. Löwis18e16552006-02-15 17:27:45 +00008710 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711 return NULL;
8712
8713 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008715 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008719}
8720
Thomas Wouters477c8d52006-05-27 19:21:47 +00008721PyObject *
8722PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8723{
8724 PyObject* str_obj;
8725 PyObject* sep_obj;
8726 PyObject* out;
8727
8728 str_obj = PyUnicode_FromObject(str_in);
8729 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008731 sep_obj = PyUnicode_FromObject(sep_in);
8732 if (!sep_obj) {
8733 Py_DECREF(str_obj);
8734 return NULL;
8735 }
8736
8737 out = stringlib_partition(
8738 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8739 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8740 );
8741
8742 Py_DECREF(sep_obj);
8743 Py_DECREF(str_obj);
8744
8745 return out;
8746}
8747
8748
8749PyObject *
8750PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8751{
8752 PyObject* str_obj;
8753 PyObject* sep_obj;
8754 PyObject* out;
8755
8756 str_obj = PyUnicode_FromObject(str_in);
8757 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008759 sep_obj = PyUnicode_FromObject(sep_in);
8760 if (!sep_obj) {
8761 Py_DECREF(str_obj);
8762 return NULL;
8763 }
8764
8765 out = stringlib_rpartition(
8766 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8767 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8768 );
8769
8770 Py_DECREF(sep_obj);
8771 Py_DECREF(str_obj);
8772
8773 return out;
8774}
8775
8776PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008778\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008779Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008780the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008781found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008782
8783static PyObject*
8784unicode_partition(PyUnicodeObject *self, PyObject *separator)
8785{
8786 return PyUnicode_Partition((PyObject *)self, separator);
8787}
8788
8789PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008790 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008791\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008792Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008793the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008794separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008795
8796static PyObject*
8797unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8798{
8799 return PyUnicode_RPartition((PyObject *)self, separator);
8800}
8801
Alexander Belopolsky40018472011-02-26 01:02:56 +00008802PyObject *
8803PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008804{
8805 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008806
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008807 s = PyUnicode_FromObject(s);
8808 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008809 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 if (sep != NULL) {
8811 sep = PyUnicode_FromObject(sep);
8812 if (sep == NULL) {
8813 Py_DECREF(s);
8814 return NULL;
8815 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008816 }
8817
8818 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8819
8820 Py_DECREF(s);
8821 Py_XDECREF(sep);
8822 return result;
8823}
8824
8825PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008827\n\
8828Return a list of the words in S, using sep as the\n\
8829delimiter string, starting at the end of the string and\n\
8830working to the front. If maxsplit is given, at most maxsplit\n\
8831splits are done. If sep is not specified, any whitespace string\n\
8832is a separator.");
8833
8834static PyObject*
8835unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8836{
8837 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008838 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008839
Martin v. Löwis18e16552006-02-15 17:27:45 +00008840 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008841 return NULL;
8842
8843 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008845 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008846 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008847 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008848 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008849}
8850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008851PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853\n\
8854Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008855Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008856is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857
8858static PyObject*
8859unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8860{
Guido van Rossum86662912000-04-11 15:38:46 +00008861 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862
Guido van Rossum86662912000-04-11 15:38:46 +00008863 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008864 return NULL;
8865
Guido van Rossum86662912000-04-11 15:38:46 +00008866 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867}
8868
8869static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008870PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871{
Walter Dörwald346737f2007-05-31 10:44:43 +00008872 if (PyUnicode_CheckExact(self)) {
8873 Py_INCREF(self);
8874 return self;
8875 } else
8876 /* Subtype -- return genuine unicode string with the same value. */
8877 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8878 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879}
8880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008881PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883\n\
8884Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008885and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886
8887static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008888unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 return fixup(self, fixswapcase);
8891}
8892
Georg Brandlceee0772007-11-27 23:48:05 +00008893PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008895\n\
8896Return a translation table usable for str.translate().\n\
8897If there is only one argument, it must be a dictionary mapping Unicode\n\
8898ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008899Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008900If there are two arguments, they must be strings of equal length, and\n\
8901in the resulting dictionary, each character in x will be mapped to the\n\
8902character at the same position in y. If there is a third argument, it\n\
8903must be a string, whose characters will be mapped to None in the result.");
8904
8905static PyObject*
8906unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8907{
8908 PyObject *x, *y = NULL, *z = NULL;
8909 PyObject *new = NULL, *key, *value;
8910 Py_ssize_t i = 0;
8911 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008912
Georg Brandlceee0772007-11-27 23:48:05 +00008913 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8914 return NULL;
8915 new = PyDict_New();
8916 if (!new)
8917 return NULL;
8918 if (y != NULL) {
8919 /* x must be a string too, of equal length */
8920 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8921 if (!PyUnicode_Check(x)) {
8922 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8923 "be a string if there is a second argument");
8924 goto err;
8925 }
8926 if (PyUnicode_GET_SIZE(x) != ylen) {
8927 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8928 "arguments must have equal length");
8929 goto err;
8930 }
8931 /* create entries for translating chars in x to those in y */
8932 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008933 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8934 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008935 if (!key || !value)
8936 goto err;
8937 res = PyDict_SetItem(new, key, value);
8938 Py_DECREF(key);
8939 Py_DECREF(value);
8940 if (res < 0)
8941 goto err;
8942 }
8943 /* create entries for deleting chars in z */
8944 if (z != NULL) {
8945 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008946 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008947 if (!key)
8948 goto err;
8949 res = PyDict_SetItem(new, key, Py_None);
8950 Py_DECREF(key);
8951 if (res < 0)
8952 goto err;
8953 }
8954 }
8955 } else {
8956 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008957 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008958 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8959 "to maketrans it must be a dict");
8960 goto err;
8961 }
8962 /* copy entries into the new dict, converting string keys to int keys */
8963 while (PyDict_Next(x, &i, &key, &value)) {
8964 if (PyUnicode_Check(key)) {
8965 /* convert string keys to integer keys */
8966 PyObject *newkey;
8967 if (PyUnicode_GET_SIZE(key) != 1) {
8968 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8969 "table must be of length 1");
8970 goto err;
8971 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008972 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008973 if (!newkey)
8974 goto err;
8975 res = PyDict_SetItem(new, newkey, value);
8976 Py_DECREF(newkey);
8977 if (res < 0)
8978 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008979 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008980 /* just keep integer keys */
8981 if (PyDict_SetItem(new, key, value) < 0)
8982 goto err;
8983 } else {
8984 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8985 "be strings or integers");
8986 goto err;
8987 }
8988 }
8989 }
8990 return new;
8991 err:
8992 Py_DECREF(new);
8993 return NULL;
8994}
8995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008996PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998\n\
8999Return a copy of the string S, where all characters have been mapped\n\
9000through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009001Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00009002Unmapped characters are left untouched. Characters mapped to None\n\
9003are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004
9005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009006unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007{
Georg Brandlceee0772007-11-27 23:48:05 +00009008 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009}
9010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009011PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009014Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015
9016static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009017unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 return fixup(self, fixupper);
9020}
9021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009022PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009025Pad a numeric string S with zeros on the left, to fill a field\n\
9026of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027
9028static PyObject *
9029unicode_zfill(PyUnicodeObject *self, PyObject *args)
9030{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009031 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 PyUnicodeObject *u;
9033
Martin v. Löwis18e16552006-02-15 17:27:45 +00009034 Py_ssize_t width;
9035 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 return NULL;
9037
9038 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009039 if (PyUnicode_CheckExact(self)) {
9040 Py_INCREF(self);
9041 return (PyObject*) self;
9042 }
9043 else
9044 return PyUnicode_FromUnicode(
9045 PyUnicode_AS_UNICODE(self),
9046 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009047 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 }
9049
9050 fill = width - self->length;
9051
9052 u = pad(self, fill, 0, '0');
9053
Walter Dörwald068325e2002-04-15 13:36:47 +00009054 if (u == NULL)
9055 return NULL;
9056
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057 if (u->str[fill] == '+' || u->str[fill] == '-') {
9058 /* move sign to beginning of string */
9059 u->str[0] = u->str[fill];
9060 u->str[fill] = '0';
9061 }
9062
9063 return (PyObject*) u;
9064}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065
#if 0
/* Compiled out.  These helpers match the disabled debugging entries
   "freelistsize" and "_decimal2ascii" in unicode_methods. */

/* Report the current value of numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}

/* Run PyUnicode_TransformDecimalToASCII() over self's buffer. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalToASCII(
        PyUnicode_AS_UNICODE(self),
        PyUnicode_GET_SIZE(self));
}
#endif
9080
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009081PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009084Return True if S starts with the specified prefix, False otherwise.\n\
9085With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009086With optional end, stop comparing S at that position.\n\
9087prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088
9089static PyObject *
9090unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009091 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009093 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009095 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009096 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009097 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098
Jesus Ceaac451502011-04-20 17:09:23 +02009099 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009101 if (PyTuple_Check(subobj)) {
9102 Py_ssize_t i;
9103 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9104 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009106 if (substring == NULL)
9107 return NULL;
9108 result = tailmatch(self, substring, start, end, -1);
9109 Py_DECREF(substring);
9110 if (result) {
9111 Py_RETURN_TRUE;
9112 }
9113 }
9114 /* nothing matched */
9115 Py_RETURN_FALSE;
9116 }
9117 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009118 if (substring == NULL) {
9119 if (PyErr_ExceptionMatches(PyExc_TypeError))
9120 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9121 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009123 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009124 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009126 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127}
9128
9129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009130PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009133Return True if S ends with the specified suffix, False otherwise.\n\
9134With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009135With optional end, stop comparing S at that position.\n\
9136suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137
9138static PyObject *
9139unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009142 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009143 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009144 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009145 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009146 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147
Jesus Ceaac451502011-04-20 17:09:23 +02009148 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009150 if (PyTuple_Check(subobj)) {
9151 Py_ssize_t i;
9152 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9153 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009155 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009157 result = tailmatch(self, substring, start, end, +1);
9158 Py_DECREF(substring);
9159 if (result) {
9160 Py_RETURN_TRUE;
9161 }
9162 }
9163 Py_RETURN_FALSE;
9164 }
9165 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009166 if (substring == NULL) {
9167 if (PyErr_ExceptionMatches(PyExc_TypeError))
9168 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9169 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009170 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009171 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009172 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009174 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175}
9176
Eric Smith8c663262007-08-25 02:26:07 +00009177#include "stringlib/string_format.h"
9178
9179PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009181\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009182Return a formatted version of S, using substitutions from args and kwargs.\n\
9183The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009184
Eric Smith27bbca62010-11-04 17:06:58 +00009185PyDoc_STRVAR(format_map__doc__,
9186 "S.format_map(mapping) -> str\n\
9187\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009188Return a formatted version of S, using substitutions from mapping.\n\
9189The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009190
Eric Smith4a7d76d2008-05-30 18:10:19 +00009191static PyObject *
9192unicode__format__(PyObject* self, PyObject* args)
9193{
9194 PyObject *format_spec;
9195
9196 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9197 return NULL;
9198
9199 return _PyUnicode_FormatAdvanced(self,
9200 PyUnicode_AS_UNICODE(format_spec),
9201 PyUnicode_GET_SIZE(format_spec));
9202}
9203
Eric Smith8c663262007-08-25 02:26:07 +00009204PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009205 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009206\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009207Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009208
9209static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009210unicode__sizeof__(PyUnicodeObject *v)
9211{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009212 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9213 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009214}
9215
9216PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009218
9219static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009220unicode_getnewargs(PyUnicodeObject *v)
9221{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009222 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009223}
9224
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225static PyMethodDef unicode_methods[] = {
9226
9227 /* Order is according to common usage: often used methods should
9228 appear first, since lookup is done sequentially. */
9229
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009230 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009231 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9232 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009233 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009234 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9235 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9236 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9237 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9238 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9239 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9240 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009241 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009242 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9243 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9244 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009245 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009246 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9247 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9248 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009249 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009250 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009251 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009252 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009253 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9254 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9255 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9256 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9257 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9258 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9259 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9260 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9261 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9262 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9263 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9264 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9265 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9266 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009267 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009268 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009269 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009270 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009271 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009272 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009273 {"maketrans", (PyCFunction) unicode_maketrans,
9274 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009275 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009276#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009277 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278#endif
9279
9280#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009281 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009282 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009283 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284#endif
9285
Benjamin Peterson14339b62009-01-31 16:36:08 +00009286 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287 {NULL, NULL}
9288};
9289
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009290static PyObject *
9291unicode_mod(PyObject *v, PyObject *w)
9292{
Brian Curtindfc80e32011-08-10 20:28:54 -05009293 if (!PyUnicode_Check(v))
9294 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +00009295 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009296}
9297
9298static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009299 0, /*nb_add*/
9300 0, /*nb_subtract*/
9301 0, /*nb_multiply*/
9302 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009303};
9304
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009306 (lenfunc) unicode_length, /* sq_length */
9307 PyUnicode_Concat, /* sq_concat */
9308 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9309 (ssizeargfunc) unicode_getitem, /* sq_item */
9310 0, /* sq_slice */
9311 0, /* sq_ass_item */
9312 0, /* sq_ass_slice */
9313 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009314};
9315
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009316static PyObject*
9317unicode_subscript(PyUnicodeObject* self, PyObject* item)
9318{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009319 if (PyIndex_Check(item)) {
9320 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009321 if (i == -1 && PyErr_Occurred())
9322 return NULL;
9323 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009324 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009325 return unicode_getitem(self, i);
9326 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009327 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009328 Py_UNICODE* source_buf;
9329 Py_UNICODE* result_buf;
9330 PyObject* result;
9331
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009332 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009333 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009334 return NULL;
9335 }
9336
9337 if (slicelength <= 0) {
9338 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009339 } else if (start == 0 && step == 1 && slicelength == self->length &&
9340 PyUnicode_CheckExact(self)) {
9341 Py_INCREF(self);
9342 return (PyObject *)self;
9343 } else if (step == 1) {
9344 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009345 } else {
9346 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009347 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9348 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009349
Benjamin Peterson29060642009-01-31 22:14:21 +00009350 if (result_buf == NULL)
9351 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009352
9353 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9354 result_buf[i] = source_buf[cur];
9355 }
Tim Petersced69f82003-09-16 20:30:58 +00009356
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009357 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009358 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009359 return result;
9360 }
9361 } else {
9362 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9363 return NULL;
9364 }
9365}
9366
9367static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009368 (lenfunc)unicode_length, /* mp_length */
9369 (binaryfunc)unicode_subscript, /* mp_subscript */
9370 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009371};
9372
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374/* Helpers for PyUnicode_Format() */
9375
9376static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009377getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009379 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 (*p_argidx)++;
9382 if (arglen < 0)
9383 return args;
9384 else
9385 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 }
9387 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 return NULL;
9390}
9391
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009392/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009394static PyObject *
9395formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009397 char *p;
9398 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009400
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401 x = PyFloat_AsDouble(v);
9402 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009403 return NULL;
9404
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009406 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009407
Eric Smith0923d1d2009-04-16 20:16:10 +00009408 p = PyOS_double_to_string(x, type, prec,
9409 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009410 if (p == NULL)
9411 return NULL;
9412 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009413 PyMem_Free(p);
9414 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009415}
9416
Tim Peters38fd5b62000-09-21 05:43:11 +00009417static PyObject*
9418formatlong(PyObject *val, int flags, int prec, int type)
9419{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009420 char *buf;
9421 int len;
9422 PyObject *str; /* temporary string object. */
9423 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009424
Benjamin Peterson14339b62009-01-31 16:36:08 +00009425 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9426 if (!str)
9427 return NULL;
9428 result = PyUnicode_FromStringAndSize(buf, len);
9429 Py_DECREF(str);
9430 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009431}
9432
Guido van Rossumd57fd912000-03-10 22:53:23 +00009433static int
9434formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009435 size_t buflen,
9436 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009438 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009439 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009440 if (PyUnicode_GET_SIZE(v) == 1) {
9441 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9442 buf[1] = '\0';
9443 return 1;
9444 }
9445#ifndef Py_UNICODE_WIDE
9446 if (PyUnicode_GET_SIZE(v) == 2) {
9447 /* Decode a valid surrogate pair */
9448 int c0 = PyUnicode_AS_UNICODE(v)[0];
9449 int c1 = PyUnicode_AS_UNICODE(v)[1];
9450 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9451 0xDC00 <= c1 && c1 <= 0xDFFF) {
9452 buf[0] = c0;
9453 buf[1] = c1;
9454 buf[2] = '\0';
9455 return 2;
9456 }
9457 }
9458#endif
9459 goto onError;
9460 }
9461 else {
9462 /* Integer input truncated to a character */
9463 long x;
9464 x = PyLong_AsLong(v);
9465 if (x == -1 && PyErr_Occurred())
9466 goto onError;
9467
9468 if (x < 0 || x > 0x10ffff) {
9469 PyErr_SetString(PyExc_OverflowError,
9470 "%c arg not in range(0x110000)");
9471 return -1;
9472 }
9473
9474#ifndef Py_UNICODE_WIDE
9475 if (x > 0xffff) {
9476 x -= 0x10000;
9477 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9478 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9479 return 2;
9480 }
9481#endif
9482 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009483 buf[1] = '\0';
9484 return 1;
9485 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009486
Benjamin Peterson29060642009-01-31 22:14:21 +00009487 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009488 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009490 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491}
9492
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009493/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009494 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009495*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009496#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009497
Alexander Belopolsky40018472011-02-26 01:02:56 +00009498PyObject *
9499PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500{
9501 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009502 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 int args_owned = 0;
9504 PyUnicodeObject *result = NULL;
9505 PyObject *dict = NULL;
9506 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009507
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 PyErr_BadInternalCall();
9510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 }
9512 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009513 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 fmt = PyUnicode_AS_UNICODE(uformat);
9516 fmtcnt = PyUnicode_GET_SIZE(uformat);
9517
9518 reslen = rescnt = fmtcnt + 100;
9519 result = _PyUnicode_New(reslen);
9520 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522 res = PyUnicode_AS_UNICODE(result);
9523
9524 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009525 arglen = PyTuple_Size(args);
9526 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527 }
9528 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 arglen = -1;
9530 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009532 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009533 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009535
9536 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 if (*fmt != '%') {
9538 if (--rescnt < 0) {
9539 rescnt = fmtcnt + 100;
9540 reslen += rescnt;
9541 if (_PyUnicode_Resize(&result, reslen) < 0)
9542 goto onError;
9543 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9544 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009545 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009547 }
9548 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 /* Got a format specifier */
9550 int flags = 0;
9551 Py_ssize_t width = -1;
9552 int prec = -1;
9553 Py_UNICODE c = '\0';
9554 Py_UNICODE fill;
9555 int isnumok;
9556 PyObject *v = NULL;
9557 PyObject *temp = NULL;
9558 Py_UNICODE *pbuf;
9559 Py_UNICODE sign;
9560 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009561 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562
Benjamin Peterson29060642009-01-31 22:14:21 +00009563 fmt++;
9564 if (*fmt == '(') {
9565 Py_UNICODE *keystart;
9566 Py_ssize_t keylen;
9567 PyObject *key;
9568 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009569
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 if (dict == NULL) {
9571 PyErr_SetString(PyExc_TypeError,
9572 "format requires a mapping");
9573 goto onError;
9574 }
9575 ++fmt;
9576 --fmtcnt;
9577 keystart = fmt;
9578 /* Skip over balanced parentheses */
9579 while (pcount > 0 && --fmtcnt >= 0) {
9580 if (*fmt == ')')
9581 --pcount;
9582 else if (*fmt == '(')
9583 ++pcount;
9584 fmt++;
9585 }
9586 keylen = fmt - keystart - 1;
9587 if (fmtcnt < 0 || pcount > 0) {
9588 PyErr_SetString(PyExc_ValueError,
9589 "incomplete format key");
9590 goto onError;
9591 }
9592#if 0
9593 /* keys are converted to strings using UTF-8 and
9594 then looked up since Python uses strings to hold
9595 variables names etc. in its namespaces and we
9596 wouldn't want to break common idioms. */
9597 key = PyUnicode_EncodeUTF8(keystart,
9598 keylen,
9599 NULL);
9600#else
9601 key = PyUnicode_FromUnicode(keystart, keylen);
9602#endif
9603 if (key == NULL)
9604 goto onError;
9605 if (args_owned) {
9606 Py_DECREF(args);
9607 args_owned = 0;
9608 }
9609 args = PyObject_GetItem(dict, key);
9610 Py_DECREF(key);
9611 if (args == NULL) {
9612 goto onError;
9613 }
9614 args_owned = 1;
9615 arglen = -1;
9616 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009617 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009618 while (--fmtcnt >= 0) {
9619 switch (c = *fmt++) {
9620 case '-': flags |= F_LJUST; continue;
9621 case '+': flags |= F_SIGN; continue;
9622 case ' ': flags |= F_BLANK; continue;
9623 case '#': flags |= F_ALT; continue;
9624 case '0': flags |= F_ZERO; continue;
9625 }
9626 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009627 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 if (c == '*') {
9629 v = getnextarg(args, arglen, &argidx);
9630 if (v == NULL)
9631 goto onError;
9632 if (!PyLong_Check(v)) {
9633 PyErr_SetString(PyExc_TypeError,
9634 "* wants int");
9635 goto onError;
9636 }
9637 width = PyLong_AsLong(v);
9638 if (width == -1 && PyErr_Occurred())
9639 goto onError;
9640 if (width < 0) {
9641 flags |= F_LJUST;
9642 width = -width;
9643 }
9644 if (--fmtcnt >= 0)
9645 c = *fmt++;
9646 }
9647 else if (c >= '0' && c <= '9') {
9648 width = c - '0';
9649 while (--fmtcnt >= 0) {
9650 c = *fmt++;
9651 if (c < '0' || c > '9')
9652 break;
9653 if ((width*10) / 10 != width) {
9654 PyErr_SetString(PyExc_ValueError,
9655 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009656 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009657 }
9658 width = width*10 + (c - '0');
9659 }
9660 }
9661 if (c == '.') {
9662 prec = 0;
9663 if (--fmtcnt >= 0)
9664 c = *fmt++;
9665 if (c == '*') {
9666 v = getnextarg(args, arglen, &argidx);
9667 if (v == NULL)
9668 goto onError;
9669 if (!PyLong_Check(v)) {
9670 PyErr_SetString(PyExc_TypeError,
9671 "* wants int");
9672 goto onError;
9673 }
9674 prec = PyLong_AsLong(v);
9675 if (prec == -1 && PyErr_Occurred())
9676 goto onError;
9677 if (prec < 0)
9678 prec = 0;
9679 if (--fmtcnt >= 0)
9680 c = *fmt++;
9681 }
9682 else if (c >= '0' && c <= '9') {
9683 prec = c - '0';
9684 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009685 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009686 if (c < '0' || c > '9')
9687 break;
9688 if ((prec*10) / 10 != prec) {
9689 PyErr_SetString(PyExc_ValueError,
9690 "prec too big");
9691 goto onError;
9692 }
9693 prec = prec*10 + (c - '0');
9694 }
9695 }
9696 } /* prec */
9697 if (fmtcnt >= 0) {
9698 if (c == 'h' || c == 'l' || c == 'L') {
9699 if (--fmtcnt >= 0)
9700 c = *fmt++;
9701 }
9702 }
9703 if (fmtcnt < 0) {
9704 PyErr_SetString(PyExc_ValueError,
9705 "incomplete format");
9706 goto onError;
9707 }
9708 if (c != '%') {
9709 v = getnextarg(args, arglen, &argidx);
9710 if (v == NULL)
9711 goto onError;
9712 }
9713 sign = 0;
9714 fill = ' ';
9715 switch (c) {
9716
9717 case '%':
9718 pbuf = formatbuf;
9719 /* presume that buffer length is at least 1 */
9720 pbuf[0] = '%';
9721 len = 1;
9722 break;
9723
9724 case 's':
9725 case 'r':
9726 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009727 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009728 temp = v;
9729 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009730 }
9731 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009732 if (c == 's')
9733 temp = PyObject_Str(v);
9734 else if (c == 'r')
9735 temp = PyObject_Repr(v);
9736 else
9737 temp = PyObject_ASCII(v);
9738 if (temp == NULL)
9739 goto onError;
9740 if (PyUnicode_Check(temp))
9741 /* nothing to do */;
9742 else {
9743 Py_DECREF(temp);
9744 PyErr_SetString(PyExc_TypeError,
9745 "%s argument has non-string str()");
9746 goto onError;
9747 }
9748 }
9749 pbuf = PyUnicode_AS_UNICODE(temp);
9750 len = PyUnicode_GET_SIZE(temp);
9751 if (prec >= 0 && len > prec)
9752 len = prec;
9753 break;
9754
9755 case 'i':
9756 case 'd':
9757 case 'u':
9758 case 'o':
9759 case 'x':
9760 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 isnumok = 0;
9762 if (PyNumber_Check(v)) {
9763 PyObject *iobj=NULL;
9764
9765 if (PyLong_Check(v)) {
9766 iobj = v;
9767 Py_INCREF(iobj);
9768 }
9769 else {
9770 iobj = PyNumber_Long(v);
9771 }
9772 if (iobj!=NULL) {
9773 if (PyLong_Check(iobj)) {
9774 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -07009775 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +00009776 Py_DECREF(iobj);
9777 if (!temp)
9778 goto onError;
9779 pbuf = PyUnicode_AS_UNICODE(temp);
9780 len = PyUnicode_GET_SIZE(temp);
9781 sign = 1;
9782 }
9783 else {
9784 Py_DECREF(iobj);
9785 }
9786 }
9787 }
9788 if (!isnumok) {
9789 PyErr_Format(PyExc_TypeError,
9790 "%%%c format: a number is required, "
9791 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9792 goto onError;
9793 }
9794 if (flags & F_ZERO)
9795 fill = '0';
9796 break;
9797
9798 case 'e':
9799 case 'E':
9800 case 'f':
9801 case 'F':
9802 case 'g':
9803 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009804 temp = formatfloat(v, flags, prec, c);
9805 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009806 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009807 pbuf = PyUnicode_AS_UNICODE(temp);
9808 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009809 sign = 1;
9810 if (flags & F_ZERO)
9811 fill = '0';
9812 break;
9813
9814 case 'c':
9815 pbuf = formatbuf;
9816 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9817 if (len < 0)
9818 goto onError;
9819 break;
9820
9821 default:
9822 PyErr_Format(PyExc_ValueError,
9823 "unsupported format character '%c' (0x%x) "
9824 "at index %zd",
9825 (31<=c && c<=126) ? (char)c : '?',
9826 (int)c,
9827 (Py_ssize_t)(fmt - 1 -
9828 PyUnicode_AS_UNICODE(uformat)));
9829 goto onError;
9830 }
9831 if (sign) {
9832 if (*pbuf == '-' || *pbuf == '+') {
9833 sign = *pbuf++;
9834 len--;
9835 }
9836 else if (flags & F_SIGN)
9837 sign = '+';
9838 else if (flags & F_BLANK)
9839 sign = ' ';
9840 else
9841 sign = 0;
9842 }
9843 if (width < len)
9844 width = len;
9845 if (rescnt - (sign != 0) < width) {
9846 reslen -= rescnt;
9847 rescnt = width + fmtcnt + 100;
9848 reslen += rescnt;
9849 if (reslen < 0) {
9850 Py_XDECREF(temp);
9851 PyErr_NoMemory();
9852 goto onError;
9853 }
9854 if (_PyUnicode_Resize(&result, reslen) < 0) {
9855 Py_XDECREF(temp);
9856 goto onError;
9857 }
9858 res = PyUnicode_AS_UNICODE(result)
9859 + reslen - rescnt;
9860 }
9861 if (sign) {
9862 if (fill != ' ')
9863 *res++ = sign;
9864 rescnt--;
9865 if (width > len)
9866 width--;
9867 }
9868 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9869 assert(pbuf[0] == '0');
9870 assert(pbuf[1] == c);
9871 if (fill != ' ') {
9872 *res++ = *pbuf++;
9873 *res++ = *pbuf++;
9874 }
9875 rescnt -= 2;
9876 width -= 2;
9877 if (width < 0)
9878 width = 0;
9879 len -= 2;
9880 }
9881 if (width > len && !(flags & F_LJUST)) {
9882 do {
9883 --rescnt;
9884 *res++ = fill;
9885 } while (--width > len);
9886 }
9887 if (fill == ' ') {
9888 if (sign)
9889 *res++ = sign;
9890 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9891 assert(pbuf[0] == '0');
9892 assert(pbuf[1] == c);
9893 *res++ = *pbuf++;
9894 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009895 }
9896 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009897 Py_UNICODE_COPY(res, pbuf, len);
9898 res += len;
9899 rescnt -= len;
9900 while (--width >= len) {
9901 --rescnt;
9902 *res++ = ' ';
9903 }
9904 if (dict && (argidx < arglen) && c != '%') {
9905 PyErr_SetString(PyExc_TypeError,
9906 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009907 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009908 goto onError;
9909 }
9910 Py_XDECREF(temp);
9911 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912 } /* until end */
9913 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009914 PyErr_SetString(PyExc_TypeError,
9915 "not all arguments converted during string formatting");
9916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917 }
9918
Thomas Woutersa96affe2006-03-12 00:29:36 +00009919 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009920 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009922 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009923 }
9924 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925 return (PyObject *)result;
9926
Benjamin Peterson29060642009-01-31 22:14:21 +00009927 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928 Py_XDECREF(result);
9929 Py_DECREF(uformat);
9930 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009931 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932 }
9933 return NULL;
9934}
9935
Jeremy Hylton938ace62002-07-17 16:30:39 +00009936static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009937unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9938
Tim Peters6d6c1a32001-08-02 04:15:00 +00009939static PyObject *
9940unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9941{
Benjamin Peterson29060642009-01-31 22:14:21 +00009942 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009943 static char *kwlist[] = {"object", "encoding", "errors", 0};
9944 char *encoding = NULL;
9945 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009946
Benjamin Peterson14339b62009-01-31 16:36:08 +00009947 if (type != &PyUnicode_Type)
9948 return unicode_subtype_new(type, args, kwds);
9949 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009950 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009951 return NULL;
9952 if (x == NULL)
9953 return (PyObject *)_PyUnicode_New(0);
9954 if (encoding == NULL && errors == NULL)
9955 return PyObject_Str(x);
9956 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009957 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009958}
9959
Guido van Rossume023fe02001-08-30 03:12:59 +00009960static PyObject *
9961unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9962{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009963 PyUnicodeObject *tmp, *pnew;
9964 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009965
Benjamin Peterson14339b62009-01-31 16:36:08 +00009966 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9967 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9968 if (tmp == NULL)
9969 return NULL;
9970 assert(PyUnicode_Check(tmp));
9971 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9972 if (pnew == NULL) {
9973 Py_DECREF(tmp);
9974 return NULL;
9975 }
9976 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9977 if (pnew->str == NULL) {
9978 _Py_ForgetReference((PyObject *)pnew);
9979 PyObject_Del(pnew);
9980 Py_DECREF(tmp);
9981 return PyErr_NoMemory();
9982 }
9983 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9984 pnew->length = n;
9985 pnew->hash = tmp->hash;
9986 Py_DECREF(tmp);
9987 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009988}
9989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009990PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009991 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009992\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009993Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009994encoding defaults to the current default string encoding.\n\
9995errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009996
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009997static PyObject *unicode_iter(PyObject *seq);
9998
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000010000 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010001 "str", /* tp_name */
10002 sizeof(PyUnicodeObject), /* tp_size */
10003 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010005 (destructor)unicode_dealloc, /* tp_dealloc */
10006 0, /* tp_print */
10007 0, /* tp_getattr */
10008 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010009 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010010 unicode_repr, /* tp_repr */
10011 &unicode_as_number, /* tp_as_number */
10012 &unicode_as_sequence, /* tp_as_sequence */
10013 &unicode_as_mapping, /* tp_as_mapping */
10014 (hashfunc) unicode_hash, /* tp_hash*/
10015 0, /* tp_call*/
10016 (reprfunc) unicode_str, /* tp_str */
10017 PyObject_GenericGetAttr, /* tp_getattro */
10018 0, /* tp_setattro */
10019 0, /* tp_as_buffer */
10020 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000010021 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010022 unicode_doc, /* tp_doc */
10023 0, /* tp_traverse */
10024 0, /* tp_clear */
10025 PyUnicode_RichCompare, /* tp_richcompare */
10026 0, /* tp_weaklistoffset */
10027 unicode_iter, /* tp_iter */
10028 0, /* tp_iternext */
10029 unicode_methods, /* tp_methods */
10030 0, /* tp_members */
10031 0, /* tp_getset */
10032 &PyBaseObject_Type, /* tp_base */
10033 0, /* tp_dict */
10034 0, /* tp_descr_get */
10035 0, /* tp_descr_set */
10036 0, /* tp_dictoffset */
10037 0, /* tp_init */
10038 0, /* tp_alloc */
10039 unicode_new, /* tp_new */
10040 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041};
10042
10043/* Initialize the Unicode implementation */
10044
Thomas Wouters78890102000-07-22 19:25:51 +000010045void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010047 int i;
10048
Thomas Wouters477c8d52006-05-27 19:21:47 +000010049 /* XXX - move this array to unicodectype.c ? */
10050 Py_UNICODE linebreak[] = {
10051 0x000A, /* LINE FEED */
10052 0x000D, /* CARRIAGE RETURN */
10053 0x001C, /* FILE SEPARATOR */
10054 0x001D, /* GROUP SEPARATOR */
10055 0x001E, /* RECORD SEPARATOR */
10056 0x0085, /* NEXT LINE */
10057 0x2028, /* LINE SEPARATOR */
10058 0x2029, /* PARAGRAPH SEPARATOR */
10059 };
10060
Fred Drakee4315f52000-05-09 19:53:39 +000010061 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010062 free_list = NULL;
10063 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010064 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010065 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010066 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010067
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010068 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010069 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010070 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010071 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010072
10073 /* initialize the linebreak bloom filter */
10074 bloom_linebreak = make_bloom_mask(
10075 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10076 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010077
10078 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010079}
10080
10081/* Finalize the Unicode implementation */
10082
Christian Heimesa156e092008-02-16 07:38:31 +000010083int
10084PyUnicode_ClearFreeList(void)
10085{
10086 int freelist_size = numfree;
10087 PyUnicodeObject *u;
10088
10089 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010090 PyUnicodeObject *v = u;
10091 u = *(PyUnicodeObject **)u;
10092 if (v->str)
10093 PyObject_DEL(v->str);
10094 Py_XDECREF(v->defenc);
10095 PyObject_Del(v);
10096 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010097 }
10098 free_list = NULL;
10099 assert(numfree == 0);
10100 return freelist_size;
10101}
10102
Guido van Rossumd57fd912000-03-10 22:53:23 +000010103void
Thomas Wouters78890102000-07-22 19:25:51 +000010104_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010106 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010108 Py_XDECREF(unicode_empty);
10109 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010110
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010111 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 if (unicode_latin1[i]) {
10113 Py_DECREF(unicode_latin1[i]);
10114 unicode_latin1[i] = NULL;
10115 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010116 }
Christian Heimesa156e092008-02-16 07:38:31 +000010117 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010118}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010119
Walter Dörwald16807132007-05-25 13:52:07 +000010120void
10121PyUnicode_InternInPlace(PyObject **p)
10122{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010123 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10124 PyObject *t;
10125 if (s == NULL || !PyUnicode_Check(s))
10126 Py_FatalError(
10127 "PyUnicode_InternInPlace: unicode strings only please!");
10128 /* If it's a subclass, we don't really know what putting
10129 it in the interned dict might do. */
10130 if (!PyUnicode_CheckExact(s))
10131 return;
10132 if (PyUnicode_CHECK_INTERNED(s))
10133 return;
10134 if (interned == NULL) {
10135 interned = PyDict_New();
10136 if (interned == NULL) {
10137 PyErr_Clear(); /* Don't leave an exception */
10138 return;
10139 }
10140 }
10141 /* It might be that the GetItem call fails even
10142 though the key is present in the dictionary,
10143 namely when this happens during a stack overflow. */
10144 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010145 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010146 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010147
Benjamin Peterson29060642009-01-31 22:14:21 +000010148 if (t) {
10149 Py_INCREF(t);
10150 Py_DECREF(*p);
10151 *p = t;
10152 return;
10153 }
Walter Dörwald16807132007-05-25 13:52:07 +000010154
Benjamin Peterson14339b62009-01-31 16:36:08 +000010155 PyThreadState_GET()->recursion_critical = 1;
10156 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10157 PyErr_Clear();
10158 PyThreadState_GET()->recursion_critical = 0;
10159 return;
10160 }
10161 PyThreadState_GET()->recursion_critical = 0;
10162 /* The two references in interned are not counted by refcnt.
10163 The deallocator will take care of this */
10164 Py_REFCNT(s) -= 2;
10165 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010166}
10167
10168void
10169PyUnicode_InternImmortal(PyObject **p)
10170{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010171 PyUnicode_InternInPlace(p);
10172 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10173 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10174 Py_INCREF(*p);
10175 }
Walter Dörwald16807132007-05-25 13:52:07 +000010176}
10177
10178PyObject *
10179PyUnicode_InternFromString(const char *cp)
10180{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010181 PyObject *s = PyUnicode_FromString(cp);
10182 if (s == NULL)
10183 return NULL;
10184 PyUnicode_InternInPlace(&s);
10185 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010186}
10187
Alexander Belopolsky40018472011-02-26 01:02:56 +000010188void
10189_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010190{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010191 PyObject *keys;
10192 PyUnicodeObject *s;
10193 Py_ssize_t i, n;
10194 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010195
Benjamin Peterson14339b62009-01-31 16:36:08 +000010196 if (interned == NULL || !PyDict_Check(interned))
10197 return;
10198 keys = PyDict_Keys(interned);
10199 if (keys == NULL || !PyList_Check(keys)) {
10200 PyErr_Clear();
10201 return;
10202 }
Walter Dörwald16807132007-05-25 13:52:07 +000010203
Benjamin Peterson14339b62009-01-31 16:36:08 +000010204 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10205 detector, interned unicode strings are not forcibly deallocated;
10206 rather, we give them their stolen references back, and then clear
10207 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010208
Benjamin Peterson14339b62009-01-31 16:36:08 +000010209 n = PyList_GET_SIZE(keys);
10210 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010211 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010212 for (i = 0; i < n; i++) {
10213 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10214 switch (s->state) {
10215 case SSTATE_NOT_INTERNED:
10216 /* XXX Shouldn't happen */
10217 break;
10218 case SSTATE_INTERNED_IMMORTAL:
10219 Py_REFCNT(s) += 1;
10220 immortal_size += s->length;
10221 break;
10222 case SSTATE_INTERNED_MORTAL:
10223 Py_REFCNT(s) += 2;
10224 mortal_size += s->length;
10225 break;
10226 default:
10227 Py_FatalError("Inconsistent interned string state.");
10228 }
10229 s->state = SSTATE_NOT_INTERNED;
10230 }
10231 fprintf(stderr, "total size of all interned strings: "
10232 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10233 "mortal/immortal\n", mortal_size, immortal_size);
10234 Py_DECREF(keys);
10235 PyDict_Clear(interned);
10236 Py_DECREF(interned);
10237 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010238}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010239
10240
10241/********************* Unicode Iterator **************************/
10242
10243typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010244 PyObject_HEAD
10245 Py_ssize_t it_index;
10246 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010247} unicodeiterobject;
10248
10249static void
10250unicodeiter_dealloc(unicodeiterobject *it)
10251{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010252 _PyObject_GC_UNTRACK(it);
10253 Py_XDECREF(it->it_seq);
10254 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010255}
10256
10257static int
10258unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10259{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010260 Py_VISIT(it->it_seq);
10261 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010262}
10263
10264static PyObject *
10265unicodeiter_next(unicodeiterobject *it)
10266{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010267 PyUnicodeObject *seq;
10268 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010269
Benjamin Peterson14339b62009-01-31 16:36:08 +000010270 assert(it != NULL);
10271 seq = it->it_seq;
10272 if (seq == NULL)
10273 return NULL;
10274 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010275
Benjamin Peterson14339b62009-01-31 16:36:08 +000010276 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10277 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010279 if (item != NULL)
10280 ++it->it_index;
10281 return item;
10282 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010283
Benjamin Peterson14339b62009-01-31 16:36:08 +000010284 Py_DECREF(seq);
10285 it->it_seq = NULL;
10286 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010287}
10288
10289static PyObject *
10290unicodeiter_len(unicodeiterobject *it)
10291{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010292 Py_ssize_t len = 0;
10293 if (it->it_seq)
10294 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10295 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010296}
10297
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type; it only exposes
   __length_hint__, implemented by unicodeiter_len(). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
10305
/* Type object of the iterator created by unicode_iter().  Instances are
   GC-tracked and iterate over themselves (tp_iter is PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
10338
10339static PyObject *
10340unicode_iter(PyObject *seq)
10341{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010342 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010343
Benjamin Peterson14339b62009-01-31 16:36:08 +000010344 if (!PyUnicode_Check(seq)) {
10345 PyErr_BadInternalCall();
10346 return NULL;
10347 }
10348 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10349 if (it == NULL)
10350 return NULL;
10351 it->it_index = 0;
10352 Py_INCREF(seq);
10353 it->it_seq = (PyUnicodeObject *)seq;
10354 _PyObject_GC_TRACK(it);
10355 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010356}
10357
Martin v. Löwis5b222132007-06-10 09:51:05 +000010358size_t
10359Py_UNICODE_strlen(const Py_UNICODE *u)
10360{
10361 int res = 0;
10362 while(*u++)
10363 res++;
10364 return res;
10365}
10366
10367Py_UNICODE*
10368Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10369{
10370 Py_UNICODE *u = s1;
10371 while ((*u++ = *s2++));
10372 return s1;
10373}
10374
10375Py_UNICODE*
10376Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10377{
10378 Py_UNICODE *u = s1;
10379 while ((*u++ = *s2++))
10380 if (n-- == 0)
10381 break;
10382 return s1;
10383}
10384
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010385Py_UNICODE*
10386Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10387{
10388 Py_UNICODE *u1 = s1;
10389 u1 += Py_UNICODE_strlen(u1);
10390 Py_UNICODE_strcpy(u1, s2);
10391 return s1;
10392}
10393
Martin v. Löwis5b222132007-06-10 09:51:05 +000010394int
10395Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10396{
10397 while (*s1 && *s2 && *s1 == *s2)
10398 s1++, s2++;
10399 if (*s1 && *s2)
10400 return (*s1 < *s2) ? -1 : +1;
10401 if (*s1)
10402 return 1;
10403 if (*s2)
10404 return -1;
10405 return 0;
10406}
10407
Victor Stinneref8d95c2010-08-16 22:03:11 +000010408int
10409Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10410{
10411 register Py_UNICODE u1, u2;
10412 for (; n != 0; n--) {
10413 u1 = *s1;
10414 u2 = *s2;
10415 if (u1 != u2)
10416 return (u1 < u2) ? -1 : +1;
10417 if (u1 == '\0')
10418 return 0;
10419 s1++;
10420 s2++;
10421 }
10422 return 0;
10423}
10424
Martin v. Löwis5b222132007-06-10 09:51:05 +000010425Py_UNICODE*
10426Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10427{
10428 const Py_UNICODE *p;
10429 for (p = s; *p; p++)
10430 if (*p == c)
10431 return (Py_UNICODE*)p;
10432 return NULL;
10433}
10434
Victor Stinner331ea922010-08-10 16:37:20 +000010435Py_UNICODE*
10436Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10437{
10438 const Py_UNICODE *p;
10439 p = s + Py_UNICODE_strlen(s);
10440 while (p != s) {
10441 p--;
10442 if (*p == c)
10443 return (Py_UNICODE*)p;
10444 }
10445 return NULL;
10446}
10447
Victor Stinner71133ff2010-09-01 23:43:53 +000010448Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010449PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010450{
10451 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10452 Py_UNICODE *copy;
10453 Py_ssize_t size;
10454
10455 /* Ensure we won't overflow the size. */
10456 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10457 PyErr_NoMemory();
10458 return NULL;
10459 }
10460 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10461 size *= sizeof(Py_UNICODE);
10462 copy = PyMem_Malloc(size);
10463 if (copy == NULL) {
10464 PyErr_NoMemory();
10465 return NULL;
10466 }
10467 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10468 return copy;
10469}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010470
Georg Brandl66c221e2010-10-14 07:04:07 +000010471/* A _string module, to export formatter_parser and formatter_field_name_split
10472 to the string.Formatter class implemented in Python. */
10473
/* Functions exported by the _string module.  Both take exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
10481
/* Module definition for _string.
   NOTE(review): the slot names below follow the documented PyModuleDef
   layout; confirm them against moduleobject.h of this CPython version. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_reload */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
10493
10494PyMODINIT_FUNC
10495PyInit__string(void)
10496{
10497 return PyModule_Create(&_string_module);
10498}
10499
10500
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010501#ifdef __cplusplus
10502}
10503#endif