blob: 1d0e97b8ce09515eb89496077df3af9573c707dd [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point 0x00..0x7F; an entry is 1 when the
   character counts as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
144
Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
/* Same for linebreaks.

   Lookup table indexed by code point 0x00..0x7F; an entry is 1 when the
   character counts as a line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
204
205/* the linebreak mask is set up by Unicode_Init below */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
Thomas Wouters477c8d52006-05-27 19:21:47 +0000217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
Antoine Pitrouf068f942010-01-13 14:19:12 +0000221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223
Benjamin Peterson29060642009-01-31 22:14:21 +0000224#define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Nonzero when chr occurs in set[0..setlen-1].  The cheap bloom test
   runs first, so unicode_member() is only called when the mask bit for
   chr is set.  The whole expansion is parenthesized so the macro can be
   negated or combined with other operators at the call site.
   Note: chr is evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312
313 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316*/
317
/* Create a Unicode object with room for `length` characters plus a
   terminating 0.  Only str[0] and str[length] are initialized; the
   caller fills in the rest.

   When length is 0 and the shared empty object exists, that object is
   returned (with its reference count incremented) instead of a new one.
   Otherwise the object header is taken from the free list when one is
   available, or freshly allocated.

   Returns NULL with MemoryError set when the size would overflow or an
   allocation fails. */
static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)
{
    register PyUnicodeObject *unicode;

    /* Optimization for empty strings */
    if (length == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Ensure we won't overflow the size. */
    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
        return (PyUnicodeObject *)PyErr_NoMemory();
    }

    /* Unicode freelist & memory allocation */
    if (free_list) {
        /* Pop the head of the free list; the link to the next free
           object is stored in the first pointer-sized slot of the
           object itself. */
        unicode = free_list;
        free_list = *(PyUnicodeObject **)unicode;
        numfree--;
        if (unicode->str) {
            /* Keep-Alive optimization: we only upsize the buffer,
               never downsize it. */
            if ((unicode->length < length) &&
                unicode_resize(unicode, length) < 0) {
                /* Growing failed: release the kept buffer so the
                   !unicode->str check below takes the error path. */
                PyObject_DEL(unicode->str);
                unicode->str = NULL;
            }
        }
        else {
            /* The recycled object has no buffer; allocate one. */
            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
        }
        PyObject_INIT(unicode, &PyUnicode_Type);
    }
    else {
        /* Free list is empty: allocate both the object and its buffer. */
        size_t new_size;
        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
        if (unicode == NULL)
            return NULL;
        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    }

    if (!unicode->str) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Initialize the first element to guard against cases where
     * the caller fails before initializing str -- unicode_resize()
     * reads str[0], and the Keep-Alive optimization can keep memory
     * allocated for str alive across a call to unicode_dealloc(unicode).
     * We don't want unicode_resize to read uninitialized memory in
     * that case.
     */
    unicode->str[0] = 0;
    unicode->str[length] = 0;
    unicode->length = length;
    unicode->hash = -1;
    unicode->state = 0;
    unicode->defenc = NULL;
    return unicode;

  onError:
    /* Undo the object initialization and release the object header;
       at this point unicode->str is NULL. */
    /* XXX UNREF/NEWREF interface should be more symmetrical */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference((PyObject *)unicode);
    PyObject_Del(unicode);
    return NULL;
}
389
/* Deallocate a Unicode object.

   A mortal interned string is first removed from the `interned`
   dictionary.  Then, if the object is exactly of type PyUnicode_Type
   and the free list holds fewer than PyUnicode_MAXFREELIST entries, the
   object is pushed onto the free list (keeping its buffer when it is
   shorter than KEEPALIVE_SIZE_LIMIT); otherwise its buffer and the
   object itself are released. */
static void
unicode_dealloc(register PyUnicodeObject *unicode)
{
    switch (PyUnicode_CHECK_INTERNED(unicode)) {
    case SSTATE_NOT_INTERNED:
        break;

    case SSTATE_INTERNED_MORTAL:
        /* revive dead object temporarily for DelItem */
        /* NOTE(review): 3 presumably covers the dictionary's two
           uncounted references (key and value) plus one to keep the
           object alive while it is removed -- confirm. */
        Py_REFCNT(unicode) = 3;
        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
            Py_FatalError(
                "deletion of interned string failed");
        break;

    case SSTATE_INTERNED_IMMORTAL:
        Py_FatalError("Immortal interned string died.");

    default:
        Py_FatalError("Inconsistent interned string state.");
    }

    if (PyUnicode_CheckExact(unicode) &&
        numfree < PyUnicode_MAXFREELIST) {
        /* Keep-Alive optimization */
        /* Buffers at or above the limit are released; shorter ones stay
           attached to the object while it sits on the free list. */
        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
            PyObject_DEL(unicode->str);
            unicode->str = NULL;
            unicode->length = 0;
        }
        if (unicode->defenc) {
            Py_CLEAR(unicode->defenc);
        }
        /* Add to free list */
        /* The link to the previous head is stored in the first
           pointer-sized slot of the object. */
        *(PyUnicodeObject **)unicode = free_list;
        free_list = unicode;
        numfree++;
    }
    else {
        /* Not recycled: release the buffer, the cached encoding and
           finally the object through its type's tp_free. */
        PyObject_DEL(unicode->str);
        Py_XDECREF(unicode->defenc);
        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
    }
}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
Mark Dickinson081dfee2009-03-18 14:47:41 +0000581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
Walter Dörwald346737f2007-05-31 10:44:43 +0000681static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000684{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000685 *fmt++ = '%';
686 if (width) {
687 if (zeropad)
688 *fmt++ = '0';
689 fmt += sprintf(fmt, "%d", width);
690 }
691 if (precision)
692 fmt += sprintf(fmt, ".%d", precision);
693 if (longflag)
694 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000695 else if (longlongflag) {
696 /* longlongflag should only ever be nonzero on machines with
697 HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699 char *f = PY_FORMAT_LONG_LONG;
700 while (*f)
701 *fmt++ = *f++;
702#else
703 /* we shouldn't ever get here */
704 assert(0);
705 *fmt++ = 'l';
706#endif
707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000708 else if (size_tflag) {
709 char *f = PY_FORMAT_SIZE_T;
710 while (*f)
711 *fmt++ = *f++;
712 }
713 *fmt++ = c;
714 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000715}
716
/* Helper for PyUnicode_FromFormatV(): parse the "width.precision" part and
   the length modifier of a single conversion.

   f must point at the '%' that introduces the conversion.  Each of the
   p_* output pointers may be NULL, in which case that value is discarded.
   The return value points at the character where parsing stopped: the
   conversion character when an "l", "ll" or "z" modifier was followed by
   d, u or i, otherwise the first character that was not consumed. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* step over the '%' */
    f++;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %li, %lld, %llu, %lli and the size_t variants
       %zd, %zu, %zi.  A modifier that is not followed by one of these
       conversions is left untouched. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* hand the results back through whichever pointers were supplied */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
781
/* Append the NUL-terminated C string `string` to the output being built.
   The macro relies on two variables of the function it is expanded in:
   `copy`, a scratch cursor over the source string, and `s`, the output
   cursor, which is left pointing just past the appended characters.
   Wrapped in do/while(0) so that it behaves as a single statement (e.g. in
   an unbraced if/else). */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794PyObject *
795PyUnicode_FromFormatV(const char *format, va_list vargs)
796{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 va_list count;
798 Py_ssize_t callcount = 0;
799 PyObject **callresults = NULL;
800 PyObject **callresult = NULL;
801 Py_ssize_t n = 0;
802 int width = 0;
803 int precision = 0;
804 int zeropad;
805 const char* f;
806 Py_UNICODE *s;
807 PyObject *string;
808 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000809 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* use abuffer instead of buffer, if we need more space
811 * (which can happen if there's a format specifier with width). */
812 char *abuffer = NULL;
813 char *realbuffer;
814 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000816 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000818 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000819 /* step 1: count the number of %S/%R/%A/%s format specifications
820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000824 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +0000825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000828 ++callcount;
829 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000830 else if (128 <= (unsigned char)*f) {
831 PyErr_Format(PyExc_ValueError,
832 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000833 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000834 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000835 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000837 }
838 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000840 if (callcount) {
841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842 if (!callresults) {
843 PyErr_NoMemory();
844 return NULL;
845 }
846 callresult = callresults;
847 }
848 /* step 3: figure out how large a buffer we need */
849 for (f = format; *f; f++) {
850 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000851#ifdef HAVE_LONG_LONG
Victor Stinner96865452011-03-01 23:44:09 +0000852 int longlongflag;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000853#endif
Victor Stinner96865452011-03-01 23:44:09 +0000854 const char* p;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855
Victor Stinner96865452011-03-01 23:44:09 +0000856 p = f;
857 f = parse_format_flags(f, &width, NULL,
858 NULL, &longlongflag, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859
Benjamin Peterson14339b62009-01-31 16:36:08 +0000860 switch (*f) {
861 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000862 {
863#ifndef Py_UNICODE_WIDE
864 int ordinal = va_arg(count, int);
865 if (ordinal > 0xffff)
866 n += 2;
867 else
868 n++;
869#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000870 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000871 n++;
872#endif
873 break;
874 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000875 case '%':
876 n++;
877 break;
878 case 'd': case 'u': case 'i': case 'x':
879 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000880#ifdef HAVE_LONG_LONG
881 if (longlongflag) {
882 if (width < MAX_LONG_LONG_CHARS)
883 width = MAX_LONG_LONG_CHARS;
884 }
885 else
886#endif
887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888 including sign. Decimal takes the most space. This
889 isn't enough for octal. If a width is specified we
890 need more (which we allocate later). */
891 if (width < MAX_LONG_CHARS)
892 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000893 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000894 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000895 if (abuffersize < width)
896 abuffersize = width;
897 break;
898 case 's':
899 {
900 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000901 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903 if (!str)
904 goto fail;
905 n += PyUnicode_GET_SIZE(str);
906 /* Remember the str and switch to the next slot */
907 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000908 break;
909 }
910 case 'U':
911 {
912 PyObject *obj = va_arg(count, PyObject *);
913 assert(obj && PyUnicode_Check(obj));
914 n += PyUnicode_GET_SIZE(obj);
915 break;
916 }
917 case 'V':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000921 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000922 assert(obj || str);
923 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +0000924 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000925 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000926 *callresult++ = NULL;
927 }
928 else {
929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930 if (!str_obj)
931 goto fail;
932 n += PyUnicode_GET_SIZE(str_obj);
933 *callresult++ = str_obj;
934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000935 break;
936 }
937 case 'S':
938 {
939 PyObject *obj = va_arg(count, PyObject *);
940 PyObject *str;
941 assert(obj);
942 str = PyObject_Str(obj);
943 if (!str)
944 goto fail;
945 n += PyUnicode_GET_SIZE(str);
946 /* Remember the str and switch to the next slot */
947 *callresult++ = str;
948 break;
949 }
950 case 'R':
951 {
952 PyObject *obj = va_arg(count, PyObject *);
953 PyObject *repr;
954 assert(obj);
955 repr = PyObject_Repr(obj);
956 if (!repr)
957 goto fail;
958 n += PyUnicode_GET_SIZE(repr);
959 /* Remember the repr and switch to the next slot */
960 *callresult++ = repr;
961 break;
962 }
963 case 'A':
964 {
965 PyObject *obj = va_arg(count, PyObject *);
966 PyObject *ascii;
967 assert(obj);
968 ascii = PyObject_ASCII(obj);
969 if (!ascii)
970 goto fail;
971 n += PyUnicode_GET_SIZE(ascii);
972 /* Remember the repr and switch to the next slot */
973 *callresult++ = ascii;
974 break;
975 }
976 case 'p':
977 (void) va_arg(count, int);
978 /* maximum 64-bit pointer representation:
979 * 0xffffffffffffffff
980 * so 19 characters is enough.
981 * XXX I count 18 -- what's the extra for?
982 */
983 n += 19;
984 break;
985 default:
986 /* if we stumble upon an unknown
987 formatting code, copy the rest of
988 the format string to the output
989 string. (we cannot just skip the
990 code, since there's no way to know
991 what's in the argument list) */
992 n += strlen(p);
993 goto expand;
994 }
995 } else
996 n++;
997 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000998 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000999 if (abuffersize > ITEM_BUFFER_LEN) {
1000 /* add 1 for sprintf's trailing null byte */
1001 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 if (!abuffer) {
1003 PyErr_NoMemory();
1004 goto fail;
1005 }
1006 realbuffer = abuffer;
1007 }
1008 else
1009 realbuffer = buffer;
1010 /* step 4: fill the buffer */
1011 /* Since we've analyzed how much space we need for the worst case,
1012 we don't have to resize the string.
1013 There can be no errors beyond this point. */
1014 string = PyUnicode_FromUnicode(NULL, n);
1015 if (!string)
1016 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001017
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 s = PyUnicode_AS_UNICODE(string);
1019 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 for (f = format; *f; f++) {
1022 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001023 const char* p;
1024 int longflag;
1025 int longlongflag;
1026 int size_tflag;
1027
1028 p = f;
1029 zeropad = (f[1] == '0');
1030 f = parse_format_flags(f, &width, &precision,
1031 &longflag, &longlongflag, &size_tflag);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001032
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 switch (*f) {
1034 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001035 {
1036 int ordinal = va_arg(vargs, int);
1037#ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 *s++ = 0xD800 | (ordinal >> 10);
1041 *s++ = 0xDC00 | (ordinal & 0x3FF);
1042 } else
1043#endif
1044 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001046 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001047 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001048 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
Victor Stinner6d970f42011-03-02 00:04:25 +00001050 width, precision, *f);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 if (longflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001053#ifdef HAVE_LONG_LONG
1054 else if (longlongflag)
1055 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1056#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001057 else if (size_tflag)
1058 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1059 else
1060 sprintf(realbuffer, fmt, va_arg(vargs, int));
1061 appendstring(realbuffer);
1062 break;
1063 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001064 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1065 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001066 if (longflag)
1067 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001068#ifdef HAVE_LONG_LONG
1069 else if (longlongflag)
1070 sprintf(realbuffer, fmt, va_arg(vargs,
1071 unsigned PY_LONG_LONG));
1072#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001073 else if (size_tflag)
1074 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1075 else
1076 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1077 appendstring(realbuffer);
1078 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001080 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001081 sprintf(realbuffer, fmt, va_arg(vargs, int));
1082 appendstring(realbuffer);
1083 break;
1084 case 's':
1085 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001086 /* unused, since we already have the result */
1087 (void) va_arg(vargs, char *);
1088 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1089 PyUnicode_GET_SIZE(*callresult));
1090 s += PyUnicode_GET_SIZE(*callresult);
1091 /* We're done with the unicode()/repr() => forget it */
1092 Py_DECREF(*callresult);
1093 /* switch to next unicode()/repr() result */
1094 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001095 break;
1096 }
1097 case 'U':
1098 {
1099 PyObject *obj = va_arg(vargs, PyObject *);
1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102 s += size;
1103 break;
1104 }
1105 case 'V':
1106 {
1107 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001108 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001109 if (obj) {
1110 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1111 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1112 s += size;
1113 } else {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1115 PyUnicode_GET_SIZE(*callresult));
1116 s += PyUnicode_GET_SIZE(*callresult);
1117 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001119 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001120 break;
1121 }
1122 case 'S':
1123 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001124 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 {
1126 Py_UNICODE *ucopy;
1127 Py_ssize_t usize;
1128 Py_ssize_t upos;
1129 /* unused, since we already have the result */
1130 (void) va_arg(vargs, PyObject *);
1131 ucopy = PyUnicode_AS_UNICODE(*callresult);
1132 usize = PyUnicode_GET_SIZE(*callresult);
1133 for (upos = 0; upos<usize;)
1134 *s++ = ucopy[upos++];
1135 /* We're done with the unicode()/repr() => forget it */
1136 Py_DECREF(*callresult);
1137 /* switch to next unicode()/repr() result */
1138 ++callresult;
1139 break;
1140 }
1141 case 'p':
1142 sprintf(buffer, "%p", va_arg(vargs, void*));
1143 /* %p is ill-defined: ensure leading 0x. */
1144 if (buffer[1] == 'X')
1145 buffer[1] = 'x';
1146 else if (buffer[1] != 'x') {
1147 memmove(buffer+2, buffer, strlen(buffer)+1);
1148 buffer[0] = '0';
1149 buffer[1] = 'x';
1150 }
1151 appendstring(buffer);
1152 break;
1153 case '%':
1154 *s++ = '%';
1155 break;
1156 default:
1157 appendstring(p);
1158 goto end;
1159 }
Victor Stinner1205f272010-09-11 00:54:47 +00001160 }
Victor Stinner1205f272010-09-11 00:54:47 +00001161 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 *s++ = *f;
1163 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
Benjamin Peterson29060642009-01-31 22:14:21 +00001165 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 if (callresults)
1167 PyObject_Free(callresults);
1168 if (abuffer)
1169 PyObject_Free(abuffer);
1170 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1171 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001173 if (callresults) {
1174 PyObject **callresult2 = callresults;
1175 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001176 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001177 ++callresult2;
1178 }
1179 PyObject_Free(callresults);
1180 }
1181 if (abuffer)
1182 PyObject_Free(abuffer);
1183 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001184}
1185
1186#undef appendstring
1187
1188PyObject *
1189PyUnicode_FromFormat(const char *format, ...)
1190{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 PyObject* ret;
1192 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001193
1194#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001195 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001196#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001197 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001198#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001199 ret = PyUnicode_FromFormatV(format, vargs);
1200 va_end(vargs);
1201 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001202}
1203
Victor Stinner5593d8a2010-10-02 11:11:27 +00001204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1205 convert a Unicode object to a wide character string.
1206
1207 - If w is NULL: return the number of wide characters (including the nul
1208 character) required to convert the unicode object. Ignore size argument.
1209
1210 - Otherwise: return the number of wide characters (excluding the nul
1211 character) written into w. Write at most size wide characters (including
1212 the nul character). */
1213static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001214unicode_aswidechar(PyUnicodeObject *unicode,
1215 wchar_t *w,
1216 Py_ssize_t size)
1217{
1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001219 Py_ssize_t res;
1220 if (w != NULL) {
1221 res = PyUnicode_GET_SIZE(unicode);
1222 if (size > res)
1223 size = res + 1;
1224 else
1225 res = size;
1226 memcpy(w, unicode->str, size * sizeof(wchar_t));
1227 return res;
1228 }
1229 else
1230 return PyUnicode_GET_SIZE(unicode) + 1;
1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1232 register const Py_UNICODE *u;
1233 const Py_UNICODE *uend;
1234 const wchar_t *worig, *wend;
1235 Py_ssize_t nchar;
1236
Victor Stinner137c34c2010-09-29 10:25:54 +00001237 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001238 uend = u + PyUnicode_GET_SIZE(unicode);
1239 if (w != NULL) {
1240 worig = w;
1241 wend = w + size;
1242 while (u != uend && w != wend) {
1243 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1244 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1245 {
1246 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1247 u += 2;
1248 }
1249 else {
1250 *w = *u;
1251 u++;
1252 }
1253 w++;
1254 }
1255 if (w != wend)
1256 *w = L'\0';
1257 return w - worig;
1258 }
1259 else {
1260 nchar = 1; /* nul character at the end */
1261 while (u != uend) {
1262 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1263 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1264 u += 2;
1265 else
1266 u++;
1267 nchar++;
1268 }
1269 }
1270 return nchar;
1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1272 register Py_UNICODE *u, *uend, ordinal;
1273 register Py_ssize_t i;
1274 wchar_t *worig, *wend;
1275 Py_ssize_t nchar;
1276
1277 u = PyUnicode_AS_UNICODE(unicode);
1278 uend = u + PyUnicode_GET_SIZE(u);
1279 if (w != NULL) {
1280 worig = w;
1281 wend = w + size;
1282 while (u != uend && w != wend) {
1283 ordinal = *u;
1284 if (ordinal > 0xffff) {
1285 ordinal -= 0x10000;
1286 *w++ = 0xD800 | (ordinal >> 10);
1287 *w++ = 0xDC00 | (ordinal & 0x3FF);
1288 }
1289 else
1290 *w++ = ordinal;
1291 u++;
1292 }
1293 if (w != wend)
1294 *w = 0;
1295 return w - worig;
1296 }
1297 else {
1298 nchar = 1; /* nul character */
1299 while (u != uend) {
1300 if (*u > 0xffff)
1301 nchar += 2;
1302 else
1303 nchar++;
1304 u++;
1305 }
1306 return nchar;
1307 }
1308#else
1309# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001310#endif
1311}
1312
1313Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001315 wchar_t *w,
1316 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317{
1318 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001319 PyErr_BadInternalCall();
1320 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323}
1324
Victor Stinner137c34c2010-09-29 10:25:54 +00001325wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001326PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001327 Py_ssize_t *size)
1328{
1329 wchar_t* buffer;
1330 Py_ssize_t buflen;
1331
1332 if (unicode == NULL) {
1333 PyErr_BadInternalCall();
1334 return NULL;
1335 }
1336
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001338 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001339 PyErr_NoMemory();
1340 return NULL;
1341 }
1342
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1344 if (buffer == NULL) {
1345 PyErr_NoMemory();
1346 return NULL;
1347 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001348 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001349 if (size != NULL)
1350 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001351 return buffer;
1352}
1353
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354#endif
1355
Alexander Belopolsky40018472011-02-26 01:02:56 +00001356PyObject *
1357PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001358{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001359 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001361 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 PyErr_SetString(PyExc_ValueError,
1363 "chr() arg not in range(0x110000)");
1364 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001365 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001366
1367#ifndef Py_UNICODE_WIDE
1368 if (ordinal > 0xffff) {
1369 ordinal -= 0x10000;
1370 s[0] = 0xD800 | (ordinal >> 10);
1371 s[1] = 0xDC00 | (ordinal & 0x3FF);
1372 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001373 }
1374#endif
1375
Hye-Shik Chang40574832004-04-06 07:24:51 +00001376 s[0] = (Py_UNICODE)ordinal;
1377 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001378}
1379
Alexander Belopolsky40018472011-02-26 01:02:56 +00001380PyObject *
1381PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001383 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001384 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001385 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001386 Py_INCREF(obj);
1387 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001388 }
1389 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 /* For a Unicode subtype that's not a Unicode object,
1391 return a true Unicode object with the same data. */
1392 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1393 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001394 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001395 PyErr_Format(PyExc_TypeError,
1396 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001397 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001398 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001399}
1400
Alexander Belopolsky40018472011-02-26 01:02:56 +00001401PyObject *
1402PyUnicode_FromEncodedObject(register PyObject *obj,
1403 const char *encoding,
1404 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001405{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001406 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001407 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001408
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 PyErr_BadInternalCall();
1411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Decoding bytes objects is the most common case and should be fast */
1415 if (PyBytes_Check(obj)) {
1416 if (PyBytes_GET_SIZE(obj) == 0) {
1417 Py_INCREF(unicode_empty);
1418 v = (PyObject *) unicode_empty;
1419 }
1420 else {
1421 v = PyUnicode_Decode(
1422 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1423 encoding, errors);
1424 }
1425 return v;
1426 }
1427
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001428 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 PyErr_SetString(PyExc_TypeError,
1430 "decoding str is not supported");
1431 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001432 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001433
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001434 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1435 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1436 PyErr_Format(PyExc_TypeError,
1437 "coercing to str: need bytes, bytearray "
1438 "or buffer-like object, %.80s found",
1439 Py_TYPE(obj)->tp_name);
1440 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001441 }
Tim Petersced69f82003-09-16 20:30:58 +00001442
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001443 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001445 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 }
Tim Petersced69f82003-09-16 20:30:58 +00001447 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001448 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001449
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001450 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001451 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
/* Copy `encoding` into `lower`, converting upper case letters to lower case
   and replacing '_' with '-', in order to catch e.g. UTF_8.

   `lower_len` is the size of the `lower` buffer including room for the
   terminating nul.  Return 0 on error (encoding is longer than
   lower_len-1; `lower` is then left unterminated), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the nul terminator */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1486
Alexander Belopolsky40018472011-02-26 01:02:56 +00001487PyObject *
1488PyUnicode_Decode(const char *s,
1489 Py_ssize_t size,
1490 const char *encoding,
1491 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001492{
1493 PyObject *buffer = NULL, *unicode;
1494 Py_buffer info;
1495 char lower[11]; /* Enough for any encoding shortcut */
1496
1497 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001499
1500 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001501 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001502 if ((strcmp(lower, "utf-8") == 0) ||
1503 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001504 return PyUnicode_DecodeUTF8(s, size, errors);
1505 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001506 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001507 (strcmp(lower, "iso-8859-1") == 0))
1508 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001509#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001510 else if (strcmp(lower, "mbcs") == 0)
1511 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001513 else if (strcmp(lower, "ascii") == 0)
1514 return PyUnicode_DecodeASCII(s, size, errors);
1515 else if (strcmp(lower, "utf-16") == 0)
1516 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1517 else if (strcmp(lower, "utf-32") == 0)
1518 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520
1521 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001522 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001523 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001524 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001525 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 if (buffer == NULL)
1527 goto onError;
1528 unicode = PyCodec_Decode(buffer, encoding, errors);
1529 if (unicode == NULL)
1530 goto onError;
1531 if (!PyUnicode_Check(unicode)) {
1532 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001533 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001534 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535 Py_DECREF(unicode);
1536 goto onError;
1537 }
1538 Py_DECREF(buffer);
1539 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001540
Benjamin Peterson29060642009-01-31 22:14:21 +00001541 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 Py_XDECREF(buffer);
1543 return NULL;
1544}
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546PyObject *
1547PyUnicode_AsDecodedObject(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 return v;
1566
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001568 return NULL;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571PyObject *
1572PyUnicode_AsDecodedUnicode(PyObject *unicode,
1573 const char *encoding,
1574 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575{
1576 PyObject *v;
1577
1578 if (!PyUnicode_Check(unicode)) {
1579 PyErr_BadArgument();
1580 goto onError;
1581 }
1582
1583 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001585
1586 /* Decode via the codec registry */
1587 v = PyCodec_Decode(unicode, encoding, errors);
1588 if (v == NULL)
1589 goto onError;
1590 if (!PyUnicode_Check(v)) {
1591 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001592 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001593 Py_TYPE(v)->tp_name);
1594 Py_DECREF(v);
1595 goto onError;
1596 }
1597 return v;
1598
Benjamin Peterson29060642009-01-31 22:14:21 +00001599 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001600 return NULL;
1601}
1602
Alexander Belopolsky40018472011-02-26 01:02:56 +00001603PyObject *
1604PyUnicode_Encode(const Py_UNICODE *s,
1605 Py_ssize_t size,
1606 const char *encoding,
1607 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608{
1609 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001610
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 unicode = PyUnicode_FromUnicode(s, size);
1612 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1615 Py_DECREF(unicode);
1616 return v;
1617}
1618
Alexander Belopolsky40018472011-02-26 01:02:56 +00001619PyObject *
1620PyUnicode_AsEncodedObject(PyObject *unicode,
1621 const char *encoding,
1622 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001623{
1624 PyObject *v;
1625
1626 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument();
1628 goto onError;
1629 }
1630
1631 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001633
1634 /* Encode via the codec registry */
1635 v = PyCodec_Encode(unicode, encoding, errors);
1636 if (v == NULL)
1637 goto onError;
1638 return v;
1639
Benjamin Peterson29060642009-01-31 22:14:21 +00001640 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001641 return NULL;
1642}
1643
Victor Stinnerad158722010-10-27 00:25:46 +00001644PyObject *
1645PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001646{
Victor Stinner313a1202010-06-11 23:56:51 +00001647#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode),
1650 NULL);
1651#elif defined(__APPLE__)
1652 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 "surrogateescape");
1655#else
1656 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001657 return PyUnicode_AsEncodedString(unicode,
1658 Py_FileSystemDefaultEncoding,
1659 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001660 }
1661 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001662 /* locale encoding with surrogateescape */
1663 wchar_t *wchar;
1664 char *bytes;
1665 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001666 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001667
1668 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1669 if (wchar == NULL)
1670 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001671 bytes = _Py_wchar2char(wchar, &error_pos);
1672 if (bytes == NULL) {
1673 if (error_pos != (size_t)-1) {
1674 char *errmsg = strerror(errno);
1675 PyObject *exc = NULL;
1676 if (errmsg == NULL)
1677 errmsg = "Py_wchar2char() failed";
1678 raise_encode_exception(&exc,
1679 "filesystemencoding",
1680 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1681 error_pos, error_pos+1,
1682 errmsg);
1683 Py_XDECREF(exc);
1684 }
1685 else
1686 PyErr_NoMemory();
1687 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001688 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001689 }
1690 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001691
1692 bytes_obj = PyBytes_FromString(bytes);
1693 PyMem_Free(bytes);
1694 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001695 }
Victor Stinnerad158722010-10-27 00:25:46 +00001696#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001697}
1698
Alexander Belopolsky40018472011-02-26 01:02:56 +00001699PyObject *
1700PyUnicode_AsEncodedString(PyObject *unicode,
1701 const char *encoding,
1702 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703{
1704 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001705 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001706
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 if (!PyUnicode_Check(unicode)) {
1708 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 }
Fred Drakee4315f52000-05-09 19:53:39 +00001711
Victor Stinner2f283c22011-03-02 01:21:46 +00001712 if (encoding == NULL) {
1713 if (errors == NULL || strcmp(errors, "strict") == 0)
1714 return PyUnicode_AsUTF8String(unicode);
1715 else
1716 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1717 PyUnicode_GET_SIZE(unicode),
1718 errors);
1719 }
Fred Drakee4315f52000-05-09 19:53:39 +00001720
1721 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001722 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001723 if ((strcmp(lower, "utf-8") == 0) ||
1724 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00001725 {
Victor Stinner2f283c22011-03-02 01:21:46 +00001726 if (errors == NULL || strcmp(errors, "strict") == 0)
Victor Stinnera5c68c32011-03-02 01:03:14 +00001727 return PyUnicode_AsUTF8String(unicode);
Victor Stinner2f283c22011-03-02 01:21:46 +00001728 else
Victor Stinnera5c68c32011-03-02 01:03:14 +00001729 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1730 PyUnicode_GET_SIZE(unicode),
1731 errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00001732 }
Victor Stinner37296e82010-06-10 13:36:23 +00001733 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001734 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001735 (strcmp(lower, "iso-8859-1") == 0))
1736 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1737 PyUnicode_GET_SIZE(unicode),
1738 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001739#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001740 else if (strcmp(lower, "mbcs") == 0)
1741 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1742 PyUnicode_GET_SIZE(unicode),
1743 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001744#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001745 else if (strcmp(lower, "ascii") == 0)
1746 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1747 PyUnicode_GET_SIZE(unicode),
1748 errors);
1749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750
1751 /* Encode via the codec registry */
1752 v = PyCodec_Encode(unicode, encoding, errors);
1753 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001754 return NULL;
1755
1756 /* The normal path */
1757 if (PyBytes_Check(v))
1758 return v;
1759
1760 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001761 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001762 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001763 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001764
1765 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1766 "encoder %s returned bytearray instead of bytes",
1767 encoding);
1768 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001769 Py_DECREF(v);
1770 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001771 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001772
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001773 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1774 Py_DECREF(v);
1775 return b;
1776 }
1777
1778 PyErr_Format(PyExc_TypeError,
1779 "encoder did not return a bytes object (type=%.400s)",
1780 Py_TYPE(v)->tp_name);
1781 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001782 return NULL;
1783}
1784
Alexander Belopolsky40018472011-02-26 01:02:56 +00001785PyObject *
1786PyUnicode_AsEncodedUnicode(PyObject *unicode,
1787 const char *encoding,
1788 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001789{
1790 PyObject *v;
1791
1792 if (!PyUnicode_Check(unicode)) {
1793 PyErr_BadArgument();
1794 goto onError;
1795 }
1796
1797 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001799
1800 /* Encode via the codec registry */
1801 v = PyCodec_Encode(unicode, encoding, errors);
1802 if (v == NULL)
1803 goto onError;
1804 if (!PyUnicode_Check(v)) {
1805 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001806 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001807 Py_TYPE(v)->tp_name);
1808 Py_DECREF(v);
1809 goto onError;
1810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001812
Benjamin Peterson29060642009-01-31 22:14:21 +00001813 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 return NULL;
1815}
1816
Alexander Belopolsky40018472011-02-26 01:02:56 +00001817PyObject *
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001818_PyUnicode_AsDefaultEncodedString(PyObject *unicode)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001819{
1820 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001821 if (v)
1822 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001823 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001824 PyUnicode_GET_SIZE(unicode),
1825 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001826 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001827 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001828 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001829 return v;
1830}
1831
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001832PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001833PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001834 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001835 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1836}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001837
Christian Heimes5894ba72007-11-04 11:43:14 +00001838PyObject*
1839PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1840{
Victor Stinnerad158722010-10-27 00:25:46 +00001841#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1842 return PyUnicode_DecodeMBCS(s, size, NULL);
1843#elif defined(__APPLE__)
1844 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1845#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001846 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1847 can be undefined. If it is case, decode using UTF-8. The following assumes
1848 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1849 bootstrapping process where the codecs aren't ready yet.
1850 */
1851 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001852 return PyUnicode_Decode(s, size,
1853 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001854 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001855 }
1856 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001857 /* locale encoding with surrogateescape */
1858 wchar_t *wchar;
1859 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001860 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001861
1862 if (s[size] != '\0' || size != strlen(s)) {
1863 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1864 return NULL;
1865 }
1866
Victor Stinner168e1172010-10-16 23:16:16 +00001867 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001868 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001869 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001870
Victor Stinner168e1172010-10-16 23:16:16 +00001871 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001872 PyMem_Free(wchar);
1873 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001874 }
Victor Stinnerad158722010-10-27 00:25:46 +00001875#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001876}
1877
Martin v. Löwis011e8422009-05-05 04:43:17 +00001878
1879int
1880PyUnicode_FSConverter(PyObject* arg, void* addr)
1881{
1882 PyObject *output = NULL;
1883 Py_ssize_t size;
1884 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001885 if (arg == NULL) {
1886 Py_DECREF(*(PyObject**)addr);
1887 return 1;
1888 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001889 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001890 output = arg;
1891 Py_INCREF(output);
1892 }
1893 else {
1894 arg = PyUnicode_FromObject(arg);
1895 if (!arg)
1896 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001897 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001898 Py_DECREF(arg);
1899 if (!output)
1900 return 0;
1901 if (!PyBytes_Check(output)) {
1902 Py_DECREF(output);
1903 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1904 return 0;
1905 }
1906 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001907 size = PyBytes_GET_SIZE(output);
1908 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001909 if (size != strlen(data)) {
1910 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1911 Py_DECREF(output);
1912 return 0;
1913 }
1914 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001915 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001916}
1917
1918
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001919int
1920PyUnicode_FSDecoder(PyObject* arg, void* addr)
1921{
1922 PyObject *output = NULL;
1923 Py_ssize_t size;
1924 void *data;
1925 if (arg == NULL) {
1926 Py_DECREF(*(PyObject**)addr);
1927 return 1;
1928 }
1929 if (PyUnicode_Check(arg)) {
1930 output = arg;
1931 Py_INCREF(output);
1932 }
1933 else {
1934 arg = PyBytes_FromObject(arg);
1935 if (!arg)
1936 return 0;
1937 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1938 PyBytes_GET_SIZE(arg));
1939 Py_DECREF(arg);
1940 if (!output)
1941 return 0;
1942 if (!PyUnicode_Check(output)) {
1943 Py_DECREF(output);
1944 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1945 return 0;
1946 }
1947 }
1948 size = PyUnicode_GET_SIZE(output);
1949 data = PyUnicode_AS_UNICODE(output);
1950 if (size != Py_UNICODE_strlen(data)) {
1951 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1952 Py_DECREF(output);
1953 return 0;
1954 }
1955 *(PyObject**)addr = output;
1956 return Py_CLEANUP_SUPPORTED;
1957}
1958
1959
Martin v. Löwis5b222132007-06-10 09:51:05 +00001960char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001961_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001962{
Christian Heimesf3863112007-11-22 07:46:41 +00001963 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001964 if (!PyUnicode_Check(unicode)) {
1965 PyErr_BadArgument();
1966 return NULL;
1967 }
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001968 bytes = _PyUnicode_AsDefaultEncodedString(unicode);
Christian Heimesf3863112007-11-22 07:46:41 +00001969 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001970 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001971 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001972 *psize = PyBytes_GET_SIZE(bytes);
1973 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001974}
1975
1976char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001977_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001978{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001979 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001980}
1981
Alexander Belopolsky40018472011-02-26 01:02:56 +00001982Py_UNICODE *
1983PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984{
1985 if (!PyUnicode_Check(unicode)) {
1986 PyErr_BadArgument();
1987 goto onError;
1988 }
1989 return PyUnicode_AS_UNICODE(unicode);
1990
Benjamin Peterson29060642009-01-31 22:14:21 +00001991 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return NULL;
1993}
1994
Alexander Belopolsky40018472011-02-26 01:02:56 +00001995Py_ssize_t
1996PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997{
1998 if (!PyUnicode_Check(unicode)) {
1999 PyErr_BadArgument();
2000 goto onError;
2001 }
2002 return PyUnicode_GET_SIZE(unicode);
2003
Benjamin Peterson29060642009-01-31 22:14:21 +00002004 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 return -1;
2006}
2007
/* Return the name of the default encoding, which is fixed to "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *default_encoding = "utf-8";

    return default_encoding;
}
2013
Victor Stinner554f3f02010-06-16 23:33:54 +00002014/* create or adjust a UnicodeDecodeError */
2015static void
2016make_decode_exception(PyObject **exceptionObject,
2017 const char *encoding,
2018 const char *input, Py_ssize_t length,
2019 Py_ssize_t startpos, Py_ssize_t endpos,
2020 const char *reason)
2021{
2022 if (*exceptionObject == NULL) {
2023 *exceptionObject = PyUnicodeDecodeError_Create(
2024 encoding, input, length, startpos, endpos, reason);
2025 }
2026 else {
2027 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2028 goto onError;
2029 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2030 goto onError;
2031 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2032 goto onError;
2033 }
2034 return;
2035
2036onError:
2037 Py_DECREF(*exceptionObject);
2038 *exceptionObject = NULL;
2039}
2040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041/* error handling callback helper:
2042 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002043 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 and adjust various state variables.
2045 return 0 on success, -1 on error
2046*/
2047
Alexander Belopolsky40018472011-02-26 01:02:56 +00002048static int
2049unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2050 const char *encoding, const char *reason,
2051 const char **input, const char **inend, Py_ssize_t *startinpos,
2052 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2053 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002055 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056
2057 PyObject *restuple = NULL;
2058 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002059 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002060 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002061 Py_ssize_t requiredsize;
2062 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002064 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002065 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 int res = -1;
2067
2068 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002069 *errorHandler = PyCodec_LookupError(errors);
2070 if (*errorHandler == NULL)
2071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 }
2073
Victor Stinner554f3f02010-06-16 23:33:54 +00002074 make_decode_exception(exceptionObject,
2075 encoding,
2076 *input, *inend - *input,
2077 *startinpos, *endinpos,
2078 reason);
2079 if (*exceptionObject == NULL)
2080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081
2082 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2083 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002086 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002087 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002088 }
2089 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002090 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002091
2092 /* Copy back the bytes variables, which might have been modified by the
2093 callback */
2094 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2095 if (!inputobj)
2096 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002097 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002098 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002099 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002100 *input = PyBytes_AS_STRING(inputobj);
2101 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002102 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002103 /* we can DECREF safely, as the exception has another reference,
2104 so the object won't go away. */
2105 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002108 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002109 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002110 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2111 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002112 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113
2114 /* need more space? (at least enough for what we
2115 have+the replacement+the rest of the string (starting
2116 at the new input position), so we won't have to check space
2117 when there are no errors in the rest of the string) */
2118 repptr = PyUnicode_AS_UNICODE(repunicode);
2119 repsize = PyUnicode_GET_SIZE(repunicode);
2120 requiredsize = *outpos + repsize + insize-newpos;
2121 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002122 if (requiredsize<2*outsize)
2123 requiredsize = 2*outsize;
2124 if (_PyUnicode_Resize(output, requiredsize) < 0)
2125 goto onError;
2126 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 }
2128 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002129 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 Py_UNICODE_COPY(*outptr, repptr, repsize);
2131 *outptr += repsize;
2132 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 /* we made it! */
2135 res = 0;
2136
Benjamin Peterson29060642009-01-31 22:14:21 +00002137 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 Py_XDECREF(restuple);
2139 return res;
2140}
2141
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002142/* --- UTF-7 Codec -------------------------------------------------------- */
2143
Antoine Pitrou244651a2009-05-04 18:56:13 +00002144/* See RFC2152 for details. We encode conservatively and decode liberally. */
2145
/* Three simple macros defining base-64.  All of them may evaluate their
   argument more than once, so pass side-effect-free expressions only. */

/* Is c a base-64 character? */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   (anything that is not a letter, a digit or '+' maps to 63, the value
   of '/') */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
2176
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; NUL and anything >= 128 is never direct.
 * directO  -- non-zero: Set O characters are encoded as themselves.
 * directWS -- non-zero: whitespace characters are encoded as themselves.
 *
 * The flag arguments are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) keep their meaning next to the `&&`.
 * c is evaluated several times; pass a side-effect-free expression. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002222
Alexander Belopolsky40018472011-02-26 01:02:56 +00002223PyObject *
2224PyUnicode_DecodeUTF7(const char *s,
2225 Py_ssize_t size,
2226 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002227{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002228 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2229}
2230
Antoine Pitrou244651a2009-05-04 18:56:13 +00002231/* The decoder. The only state we preserve is our read position,
2232 * i.e. how many characters we have consumed. So if we end in the
2233 * middle of a shift sequence we have to back off the read position
2234 * and the output to the beginning of the sequence, otherwise we lose
2235 * all the shift state (seen bits, number of bits seen, high
2236 * surrogate). */
2237
Alexander Belopolsky40018472011-02-26 01:02:56 +00002238PyObject *
2239PyUnicode_DecodeUTF7Stateful(const char *s,
2240 Py_ssize_t size,
2241 const char *errors,
2242 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002243{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002245 Py_ssize_t startinpos;
2246 Py_ssize_t endinpos;
2247 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002248 const char *e;
2249 PyUnicodeObject *unicode;
2250 Py_UNICODE *p;
2251 const char *errmsg = "";
2252 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002253 Py_UNICODE *shiftOutStart;
2254 unsigned int base64bits = 0;
2255 unsigned long base64buffer = 0;
2256 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 PyObject *errorHandler = NULL;
2258 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002259
2260 unicode = _PyUnicode_New(size);
2261 if (!unicode)
2262 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002263 if (size == 0) {
2264 if (consumed)
2265 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002266 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002267 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002268
2269 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002270 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002271 e = s + size;
2272
2273 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002274 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002275 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002276 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002277
Antoine Pitrou244651a2009-05-04 18:56:13 +00002278 if (inShift) { /* in a base-64 section */
2279 if (IS_BASE64(ch)) { /* consume a base-64 character */
2280 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2281 base64bits += 6;
2282 s++;
2283 if (base64bits >= 16) {
2284 /* we have enough bits for a UTF-16 value */
2285 Py_UNICODE outCh = (Py_UNICODE)
2286 (base64buffer >> (base64bits-16));
2287 base64bits -= 16;
2288 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2289 if (surrogate) {
2290 /* expecting a second surrogate */
2291 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2292#ifdef Py_UNICODE_WIDE
2293 *p++ = (((surrogate & 0x3FF)<<10)
2294 | (outCh & 0x3FF)) + 0x10000;
2295#else
2296 *p++ = surrogate;
2297 *p++ = outCh;
2298#endif
2299 surrogate = 0;
2300 }
2301 else {
2302 surrogate = 0;
2303 errmsg = "second surrogate missing";
2304 goto utf7Error;
2305 }
2306 }
2307 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2308 /* first surrogate */
2309 surrogate = outCh;
2310 }
2311 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2312 errmsg = "unexpected second surrogate";
2313 goto utf7Error;
2314 }
2315 else {
2316 *p++ = outCh;
2317 }
2318 }
2319 }
2320 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002321 inShift = 0;
2322 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002323 if (surrogate) {
2324 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002325 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002326 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002327 if (base64bits > 0) { /* left-over bits */
2328 if (base64bits >= 6) {
2329 /* We've seen at least one base-64 character */
2330 errmsg = "partial character in shift sequence";
2331 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002332 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002333 else {
2334 /* Some bits remain; they should be zero */
2335 if (base64buffer != 0) {
2336 errmsg = "non-zero padding bits in shift sequence";
2337 goto utf7Error;
2338 }
2339 }
2340 }
2341 if (ch != '-') {
2342 /* '-' is absorbed; other terminating
2343 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002344 *p++ = ch;
2345 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002346 }
2347 }
2348 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002349 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002350 s++; /* consume '+' */
2351 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002352 s++;
2353 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002354 }
2355 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002356 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002357 shiftOutStart = p;
2358 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002359 }
2360 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002361 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002362 *p++ = ch;
2363 s++;
2364 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002365 else {
2366 startinpos = s-starts;
2367 s++;
2368 errmsg = "unexpected special character";
2369 goto utf7Error;
2370 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002371 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002372utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002373 outpos = p-PyUnicode_AS_UNICODE(unicode);
2374 endinpos = s-starts;
2375 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 errors, &errorHandler,
2377 "utf7", errmsg,
2378 &starts, &e, &startinpos, &endinpos, &exc, &s,
2379 &unicode, &outpos, &p))
2380 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002381 }
2382
Antoine Pitrou244651a2009-05-04 18:56:13 +00002383 /* end of string */
2384
2385 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2386 /* if we're in an inconsistent state, that's an error */
2387 if (surrogate ||
2388 (base64bits >= 6) ||
2389 (base64bits > 0 && base64buffer != 0)) {
2390 outpos = p-PyUnicode_AS_UNICODE(unicode);
2391 endinpos = size;
2392 if (unicode_decode_call_errorhandler(
2393 errors, &errorHandler,
2394 "utf7", "unterminated shift sequence",
2395 &starts, &e, &startinpos, &endinpos, &exc, &s,
2396 &unicode, &outpos, &p))
2397 goto onError;
2398 if (s < e)
2399 goto restart;
2400 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002401 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002402
2403 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002404 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002405 if (inShift) {
2406 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002407 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002408 }
2409 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002410 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002411 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002412 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002413
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002414 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002415 goto onError;
2416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002417 Py_XDECREF(errorHandler);
2418 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002419 return (PyObject *)unicode;
2420
Benjamin Peterson29060642009-01-31 22:14:21 +00002421 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002422 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002424 Py_DECREF(unicode);
2425 return NULL;
2426}
2427
2428
Alexander Belopolsky40018472011-02-26 01:02:56 +00002429PyObject *
2430PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2431 Py_ssize_t size,
2432 int base64SetO,
2433 int base64WhiteSpace,
2434 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002435{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002436 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002437 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002438 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002439 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002440 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002441 unsigned int base64bits = 0;
2442 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002443 char * out;
2444 char * start;
2445
2446 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002447 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002448
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002449 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002450 return PyErr_NoMemory();
2451
Antoine Pitrou244651a2009-05-04 18:56:13 +00002452 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002453 if (v == NULL)
2454 return NULL;
2455
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002456 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002457 for (;i < size; ++i) {
2458 Py_UNICODE ch = s[i];
2459
Antoine Pitrou244651a2009-05-04 18:56:13 +00002460 if (inShift) {
2461 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2462 /* shifting out */
2463 if (base64bits) { /* output remaining bits */
2464 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2465 base64buffer = 0;
2466 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002467 }
2468 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002469 /* Characters not in the BASE64 set implicitly unshift the sequence
2470 so no '-' is required, except if the character is itself a '-' */
2471 if (IS_BASE64(ch) || ch == '-') {
2472 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002473 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002474 *out++ = (char) ch;
2475 }
2476 else {
2477 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002478 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002479 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002480 else { /* not in a shift sequence */
2481 if (ch == '+') {
2482 *out++ = '+';
2483 *out++ = '-';
2484 }
2485 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2486 *out++ = (char) ch;
2487 }
2488 else {
2489 *out++ = '+';
2490 inShift = 1;
2491 goto encode_char;
2492 }
2493 }
2494 continue;
2495encode_char:
2496#ifdef Py_UNICODE_WIDE
2497 if (ch >= 0x10000) {
2498 /* code first surrogate */
2499 base64bits += 16;
2500 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2501 while (base64bits >= 6) {
2502 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2503 base64bits -= 6;
2504 }
2505 /* prepare second surrogate */
2506 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2507 }
2508#endif
2509 base64bits += 16;
2510 base64buffer = (base64buffer << 16) | ch;
2511 while (base64bits >= 6) {
2512 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2513 base64bits -= 6;
2514 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002515 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002516 if (base64bits)
2517 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2518 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002519 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002520 if (_PyBytes_Resize(&v, out - start) < 0)
2521 return NULL;
2522 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002523}
2524
Antoine Pitrou244651a2009-05-04 18:56:13 +00002525#undef IS_BASE64
2526#undef FROM_BASE64
2527#undef TO_BASE64
2528#undef DECODE_DIRECT
2529#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002530
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531/* --- UTF-8 Codec -------------------------------------------------------- */
2532
/* Sequence length, in bytes, for every possible first byte of a UTF-8
   sequence.  The table is indexed by the byte value; an entry of zero
   marks a byte that is not a legal prefix.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2554
Alexander Belopolsky40018472011-02-26 01:02:56 +00002555PyObject *
2556PyUnicode_DecodeUTF8(const char *s,
2557 Py_ssize_t size,
2558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559{
Walter Dörwald69652032004-09-07 20:24:22 +00002560 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2561}
2562
Antoine Pitrouab868312009-01-10 15:40:25 +00002563/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2564#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2565
2566/* Mask to quickly check whether a C 'long' contains a
2567 non-ASCII, UTF8-encoded char. */
2568#if (SIZEOF_LONG == 8)
2569# define ASCII_CHAR_MASK 0x8080808080808080L
2570#elif (SIZEOF_LONG == 4)
2571# define ASCII_CHAR_MASK 0x80808080L
2572#else
2573# error C 'long' size should be either 4 or 8!
2574#endif
2575
Alexander Belopolsky40018472011-02-26 01:02:56 +00002576PyObject *
2577PyUnicode_DecodeUTF8Stateful(const char *s,
2578 Py_ssize_t size,
2579 const char *errors,
2580 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002581{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002582 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002584 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002585 Py_ssize_t startinpos;
2586 Py_ssize_t endinpos;
2587 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002588 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 PyUnicodeObject *unicode;
2590 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002591 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002592 PyObject *errorHandler = NULL;
2593 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594
2595 /* Note: size will always be longer than the resulting Unicode
2596 character count */
2597 unicode = _PyUnicode_New(size);
2598 if (!unicode)
2599 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002600 if (size == 0) {
2601 if (consumed)
2602 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
2606 /* Unpack UTF-8 encoded data */
2607 p = unicode->str;
2608 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002609 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002612 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613
2614 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002615 /* Fast path for runs of ASCII characters. Given that common UTF-8
2616 input will consist of an overwhelming majority of ASCII
2617 characters, we try to optimize for this case by checking
2618 as many characters as a C 'long' can contain.
2619 First, check if we can do an aligned read, as most CPUs have
2620 a penalty for unaligned reads.
2621 */
2622 if (!((size_t) s & LONG_PTR_MASK)) {
2623 /* Help register allocation */
2624 register const char *_s = s;
2625 register Py_UNICODE *_p = p;
2626 while (_s < aligned_end) {
2627 /* Read a whole long at a time (either 4 or 8 bytes),
2628 and do a fast unrolled copy if it only contains ASCII
2629 characters. */
2630 unsigned long data = *(unsigned long *) _s;
2631 if (data & ASCII_CHAR_MASK)
2632 break;
2633 _p[0] = (unsigned char) _s[0];
2634 _p[1] = (unsigned char) _s[1];
2635 _p[2] = (unsigned char) _s[2];
2636 _p[3] = (unsigned char) _s[3];
2637#if (SIZEOF_LONG == 8)
2638 _p[4] = (unsigned char) _s[4];
2639 _p[5] = (unsigned char) _s[5];
2640 _p[6] = (unsigned char) _s[6];
2641 _p[7] = (unsigned char) _s[7];
2642#endif
2643 _s += SIZEOF_LONG;
2644 _p += SIZEOF_LONG;
2645 }
2646 s = _s;
2647 p = _p;
2648 if (s == e)
2649 break;
2650 ch = (unsigned char)*s;
2651 }
2652 }
2653
2654 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002655 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656 s++;
2657 continue;
2658 }
2659
2660 n = utf8_code_length[ch];
2661
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002662 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002663 if (consumed)
2664 break;
2665 else {
2666 errmsg = "unexpected end of data";
2667 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002668 endinpos = startinpos+1;
2669 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2670 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 goto utf8Error;
2672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674
2675 switch (n) {
2676
2677 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002678 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002679 startinpos = s-starts;
2680 endinpos = startinpos+1;
2681 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682
2683 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002684 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 startinpos = s-starts;
2686 endinpos = startinpos+1;
2687 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688
2689 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002690 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002691 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002693 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002694 goto utf8Error;
2695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002697 assert ((ch > 0x007F) && (ch <= 0x07FF));
2698 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 break;
2700
2701 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002702 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2703 will result in surrogates in range d800-dfff. Surrogates are
2704 not valid UTF-8 so they are rejected.
2705 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2706 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002707 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002708 (s[2] & 0xc0) != 0x80 ||
2709 ((unsigned char)s[0] == 0xE0 &&
2710 (unsigned char)s[1] < 0xA0) ||
2711 ((unsigned char)s[0] == 0xED &&
2712 (unsigned char)s[1] > 0x9F)) {
2713 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002714 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002715 endinpos = startinpos + 1;
2716
2717 /* if s[1] first two bits are 1 and 0, then the invalid
2718 continuation byte is s[2], so increment endinpos by 1,
2719 if not, s[1] is invalid and endinpos doesn't need to
2720 be incremented. */
2721 if ((s[1] & 0xC0) == 0x80)
2722 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002723 goto utf8Error;
2724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002726 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2727 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002728 break;
2729
2730 case 4:
2731 if ((s[1] & 0xc0) != 0x80 ||
2732 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002733 (s[3] & 0xc0) != 0x80 ||
2734 ((unsigned char)s[0] == 0xF0 &&
2735 (unsigned char)s[1] < 0x90) ||
2736 ((unsigned char)s[0] == 0xF4 &&
2737 (unsigned char)s[1] > 0x8F)) {
2738 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002739 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002740 endinpos = startinpos + 1;
2741 if ((s[1] & 0xC0) == 0x80) {
2742 endinpos++;
2743 if ((s[2] & 0xC0) == 0x80)
2744 endinpos++;
2745 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002746 goto utf8Error;
2747 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002748 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002749 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2750 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2751
Fredrik Lundh8f455852001-06-27 18:59:43 +00002752#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002753 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002754#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002755 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002756
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002757 /* translate from 10000..10FFFF to 0..FFFF */
2758 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002759
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002760 /* high surrogate = top 10 bits added to D800 */
2761 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002762
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002763 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002764 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002765#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 }
2768 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002769 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002770
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 utf8Error:
2772 outpos = p-PyUnicode_AS_UNICODE(unicode);
2773 if (unicode_decode_call_errorhandler(
2774 errors, &errorHandler,
2775 "utf8", errmsg,
2776 &starts, &e, &startinpos, &endinpos, &exc, &s,
2777 &unicode, &outpos, &p))
2778 goto onError;
2779 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 }
Walter Dörwald69652032004-09-07 20:24:22 +00002781 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002782 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783
2784 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002785 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 goto onError;
2787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788 Py_XDECREF(errorHandler);
2789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 return (PyObject *)unicode;
2791
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 Py_XDECREF(errorHandler);
2794 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 Py_DECREF(unicode);
2796 return NULL;
2797}
2798
Antoine Pitrouab868312009-01-10 15:40:25 +00002799#undef ASCII_CHAR_MASK
2800
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a newly PyMem_Malloc()'ed,
   NUL-terminated wchar_t string.  Every byte that is not part of a
   valid UTF-8 sequence is stored as 0xDC00 + byte.  Returns NULL if
   the buffer size would overflow (an exception is set) or if the
   allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Same limit as "PY_SSIZE_T_MAX / sizeof(wchar_t) < size + 1", but
       written without computing size + 1 (which overflows when size is
       PY_SSIZE_T_MAX).  The cast also makes every negative size fail
       here instead of reaching the allocation below. */
    if ((size_t)size >= (size_t)PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* sequence is cut short by the end of the input */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* The output buffer holds wchar_t, so cast to that type
               like the other cases do. */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the offending first byte here: every jump to
           this label happens before ch is reassigned.  Escape that one
           byte and resume decoding at the next one. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002915
Tim Peters602f7402002-04-27 18:03:26 +00002916/* Allocation strategy: if the string is short, convert into a stack buffer
2917 and allocate exactly as much space needed at the end. Else allocate the
2918 maximum possible needed (4 result bytes per Unicode character), and return
2919 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002920*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002921PyObject *
2922PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 Py_ssize_t size,
2924 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925{
Tim Peters602f7402002-04-27 18:03:26 +00002926#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002927
Guido van Rossum98297ee2007-11-06 21:34:58 +00002928 Py_ssize_t i; /* index into s of next input byte */
2929 PyObject *result; /* result string object */
2930 char *p; /* next free byte in output buffer */
2931 Py_ssize_t nallocated; /* number of result bytes allocated */
2932 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002933 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002934 PyObject *errorHandler = NULL;
2935 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002936
Tim Peters602f7402002-04-27 18:03:26 +00002937 assert(s != NULL);
2938 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939
Tim Peters602f7402002-04-27 18:03:26 +00002940 if (size <= MAX_SHORT_UNICHARS) {
2941 /* Write into the stack buffer; nallocated can't overflow.
2942 * At the end, we'll allocate exactly as much heap space as it
2943 * turns out we need.
2944 */
2945 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002946 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002947 p = stackbuf;
2948 }
2949 else {
2950 /* Overallocate on the heap, and give the excess back at the end. */
2951 nallocated = size * 4;
2952 if (nallocated / 4 != size) /* overflow! */
2953 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002954 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002955 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002956 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002957 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002958 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002959
Tim Peters602f7402002-04-27 18:03:26 +00002960 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002961 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002962
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002963 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002964 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002966
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002968 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002969 *p++ = (char)(0xc0 | (ch >> 6));
2970 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002971 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002972#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002973 /* Special case: check for high and low surrogate */
2974 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2975 Py_UCS4 ch2 = s[i];
2976 /* Combine the two surrogates to form a UCS4 value */
2977 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2978 i++;
2979
2980 /* Encode UCS4 Unicode ordinals */
2981 *p++ = (char)(0xf0 | (ch >> 18));
2982 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002983 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2984 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002985 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002986#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002987 Py_ssize_t newpos;
2988 PyObject *rep;
2989 Py_ssize_t repsize, k;
2990 rep = unicode_encode_call_errorhandler
2991 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2992 s, size, &exc, i-1, i, &newpos);
2993 if (!rep)
2994 goto error;
2995
2996 if (PyBytes_Check(rep))
2997 repsize = PyBytes_GET_SIZE(rep);
2998 else
2999 repsize = PyUnicode_GET_SIZE(rep);
3000
3001 if (repsize > 4) {
3002 Py_ssize_t offset;
3003
3004 if (result == NULL)
3005 offset = p - stackbuf;
3006 else
3007 offset = p - PyBytes_AS_STRING(result);
3008
3009 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3010 /* integer overflow */
3011 PyErr_NoMemory();
3012 goto error;
3013 }
3014 nallocated += repsize - 4;
3015 if (result != NULL) {
3016 if (_PyBytes_Resize(&result, nallocated) < 0)
3017 goto error;
3018 } else {
3019 result = PyBytes_FromStringAndSize(NULL, nallocated);
3020 if (result == NULL)
3021 goto error;
3022 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3023 }
3024 p = PyBytes_AS_STRING(result) + offset;
3025 }
3026
3027 if (PyBytes_Check(rep)) {
3028 char *prep = PyBytes_AS_STRING(rep);
3029 for(k = repsize; k > 0; k--)
3030 *p++ = *prep++;
3031 } else /* rep is unicode */ {
3032 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3033 Py_UNICODE c;
3034
3035 for(k=0; k<repsize; k++) {
3036 c = prep[k];
3037 if (0x80 <= c) {
3038 raise_encode_exception(&exc, "utf-8", s, size,
3039 i-1, i, "surrogates not allowed");
3040 goto error;
3041 }
3042 *p++ = (char)prep[k];
3043 }
3044 }
3045 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003046#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003047 }
Victor Stinner445a6232010-04-22 20:01:57 +00003048#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003049 } else if (ch < 0x10000) {
3050 *p++ = (char)(0xe0 | (ch >> 12));
3051 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3052 *p++ = (char)(0x80 | (ch & 0x3f));
3053 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003054 /* Encode UCS4 Unicode ordinals */
3055 *p++ = (char)(0xf0 | (ch >> 18));
3056 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3057 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3058 *p++ = (char)(0x80 | (ch & 0x3f));
3059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003061
Guido van Rossum98297ee2007-11-06 21:34:58 +00003062 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003063 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003064 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003065 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003066 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003067 }
3068 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003069 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003070 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003071 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003072 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003073 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003074 Py_XDECREF(errorHandler);
3075 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003076 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003077 error:
3078 Py_XDECREF(errorHandler);
3079 Py_XDECREF(exc);
3080 Py_XDECREF(result);
3081 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003082
Tim Peters602f7402002-04-27 18:03:26 +00003083#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084}
3085
Alexander Belopolsky40018472011-02-26 01:02:56 +00003086PyObject *
3087PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088{
Victor Stinnera5c68c32011-03-02 01:03:14 +00003089 PyObject *utf8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument();
3092 return NULL;
3093 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003094 utf8 = _PyUnicode_AsDefaultEncodedString(unicode);
3095 if (utf8 == NULL)
3096 return NULL;
3097 Py_INCREF(utf8);
3098 return utf8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099}
3100
Walter Dörwald41980ca2007-08-16 21:55:45 +00003101/* --- UTF-32 Codec ------------------------------------------------------- */
3102
3103PyObject *
3104PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 Py_ssize_t size,
3106 const char *errors,
3107 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003108{
3109 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3110}
3111
3112PyObject *
3113PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003114 Py_ssize_t size,
3115 const char *errors,
3116 int *byteorder,
3117 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003118{
3119 const char *starts = s;
3120 Py_ssize_t startinpos;
3121 Py_ssize_t endinpos;
3122 Py_ssize_t outpos;
3123 PyUnicodeObject *unicode;
3124 Py_UNICODE *p;
3125#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003126 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003127 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003128#else
3129 const int pairs = 0;
3130#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003131 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003132 int bo = 0; /* assume native ordering by default */
3133 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003134 /* Offsets from q for retrieving bytes in the right order. */
3135#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3136 int iorder[] = {0, 1, 2, 3};
3137#else
3138 int iorder[] = {3, 2, 1, 0};
3139#endif
3140 PyObject *errorHandler = NULL;
3141 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003142
Walter Dörwald41980ca2007-08-16 21:55:45 +00003143 q = (unsigned char *)s;
3144 e = q + size;
3145
3146 if (byteorder)
3147 bo = *byteorder;
3148
3149 /* Check for BOM marks (U+FEFF) in the input and adjust current
3150 byte order setting accordingly. In native mode, the leading BOM
3151 mark is skipped, in all other modes, it is copied to the output
3152 stream as-is (giving a ZWNBSP character). */
3153 if (bo == 0) {
3154 if (size >= 4) {
3155 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003156 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003157#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003158 if (bom == 0x0000FEFF) {
3159 q += 4;
3160 bo = -1;
3161 }
3162 else if (bom == 0xFFFE0000) {
3163 q += 4;
3164 bo = 1;
3165 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003166#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 if (bom == 0x0000FEFF) {
3168 q += 4;
3169 bo = 1;
3170 }
3171 else if (bom == 0xFFFE0000) {
3172 q += 4;
3173 bo = -1;
3174 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003175#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003177 }
3178
3179 if (bo == -1) {
3180 /* force LE */
3181 iorder[0] = 0;
3182 iorder[1] = 1;
3183 iorder[2] = 2;
3184 iorder[3] = 3;
3185 }
3186 else if (bo == 1) {
3187 /* force BE */
3188 iorder[0] = 3;
3189 iorder[1] = 2;
3190 iorder[2] = 1;
3191 iorder[3] = 0;
3192 }
3193
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003194 /* On narrow builds we split characters outside the BMP into two
3195 codepoints => count how much extra space we need. */
3196#ifndef Py_UNICODE_WIDE
3197 for (qq = q; qq < e; qq += 4)
3198 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3199 pairs++;
3200#endif
3201
3202 /* This might be one to much, because of a BOM */
3203 unicode = _PyUnicode_New((size+3)/4+pairs);
3204 if (!unicode)
3205 return NULL;
3206 if (size == 0)
3207 return (PyObject *)unicode;
3208
3209 /* Unpack UTF-32 encoded data */
3210 p = unicode->str;
3211
Walter Dörwald41980ca2007-08-16 21:55:45 +00003212 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 Py_UCS4 ch;
3214 /* remaining bytes at the end? (size should be divisible by 4) */
3215 if (e-q<4) {
3216 if (consumed)
3217 break;
3218 errmsg = "truncated data";
3219 startinpos = ((const char *)q)-starts;
3220 endinpos = ((const char *)e)-starts;
3221 goto utf32Error;
3222 /* The remaining input chars are ignored if the callback
3223 chooses to skip the input */
3224 }
3225 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3226 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003227
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 if (ch >= 0x110000)
3229 {
3230 errmsg = "codepoint not in range(0x110000)";
3231 startinpos = ((const char *)q)-starts;
3232 endinpos = startinpos+4;
3233 goto utf32Error;
3234 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003235#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 if (ch >= 0x10000)
3237 {
3238 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3239 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3240 }
3241 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003242#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 *p++ = ch;
3244 q += 4;
3245 continue;
3246 utf32Error:
3247 outpos = p-PyUnicode_AS_UNICODE(unicode);
3248 if (unicode_decode_call_errorhandler(
3249 errors, &errorHandler,
3250 "utf32", errmsg,
3251 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3252 &unicode, &outpos, &p))
3253 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003254 }
3255
3256 if (byteorder)
3257 *byteorder = bo;
3258
3259 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003261
3262 /* Adjust length */
3263 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3264 goto onError;
3265
3266 Py_XDECREF(errorHandler);
3267 Py_XDECREF(exc);
3268 return (PyObject *)unicode;
3269
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003271 Py_DECREF(unicode);
3272 Py_XDECREF(errorHandler);
3273 Py_XDECREF(exc);
3274 return NULL;
3275}
3276
3277PyObject *
3278PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003279 Py_ssize_t size,
3280 const char *errors,
3281 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003282{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003283 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003284 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003285 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003286#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003287 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003288#else
3289 const int pairs = 0;
3290#endif
3291 /* Offsets from p for storing byte pairs in the right order. */
3292#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3293 int iorder[] = {0, 1, 2, 3};
3294#else
3295 int iorder[] = {3, 2, 1, 0};
3296#endif
3297
Benjamin Peterson29060642009-01-31 22:14:21 +00003298#define STORECHAR(CH) \
3299 do { \
3300 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3301 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3302 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3303 p[iorder[0]] = (CH) & 0xff; \
3304 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003305 } while(0)
3306
3307 /* In narrow builds we can output surrogate pairs as one codepoint,
3308 so we need less space. */
3309#ifndef Py_UNICODE_WIDE
3310 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3312 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3313 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003314#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003315 nsize = (size - pairs + (byteorder == 0));
3316 bytesize = nsize * 4;
3317 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003318 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003319 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320 if (v == NULL)
3321 return NULL;
3322
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003323 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003324 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003325 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003326 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003327 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003328
3329 if (byteorder == -1) {
3330 /* force LE */
3331 iorder[0] = 0;
3332 iorder[1] = 1;
3333 iorder[2] = 2;
3334 iorder[3] = 3;
3335 }
3336 else if (byteorder == 1) {
3337 /* force BE */
3338 iorder[0] = 3;
3339 iorder[1] = 2;
3340 iorder[2] = 1;
3341 iorder[3] = 0;
3342 }
3343
3344 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003346#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003347 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3348 Py_UCS4 ch2 = *s;
3349 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3350 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3351 s++;
3352 size--;
3353 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003354 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003355#endif
3356 STORECHAR(ch);
3357 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003358
3359 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003360 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003361#undef STORECHAR
3362}
3363
Alexander Belopolsky40018472011-02-26 01:02:56 +00003364PyObject *
3365PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003366{
3367 if (!PyUnicode_Check(unicode)) {
3368 PyErr_BadArgument();
3369 return NULL;
3370 }
3371 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 PyUnicode_GET_SIZE(unicode),
3373 NULL,
3374 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003375}
3376
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377/* --- UTF-16 Codec ------------------------------------------------------- */
3378
Tim Peters772747b2001-08-09 22:21:55 +00003379PyObject *
3380PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003381 Py_ssize_t size,
3382 const char *errors,
3383 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384{
Walter Dörwald69652032004-09-07 20:24:22 +00003385 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3386}
3387
Antoine Pitrouab868312009-01-10 15:40:25 +00003388/* Two masks for fast checking of whether a C 'long' may contain
3389 UTF16-encoded surrogate characters. This is an efficient heuristic,
3390 assuming that non-surrogate characters with a code point >= 0x8000 are
3391 rare in most input.
3392 FAST_CHAR_MASK is used when the input is in native byte ordering,
3393 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003394*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003395#if (SIZEOF_LONG == 8)
3396# define FAST_CHAR_MASK 0x8000800080008000L
3397# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3398#elif (SIZEOF_LONG == 4)
3399# define FAST_CHAR_MASK 0x80008000L
3400# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3401#else
3402# error C 'long' size should be either 4 or 8!
3403#endif
3404
Walter Dörwald69652032004-09-07 20:24:22 +00003405PyObject *
3406PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003407 Py_ssize_t size,
3408 const char *errors,
3409 int *byteorder,
3410 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003411{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003413 Py_ssize_t startinpos;
3414 Py_ssize_t endinpos;
3415 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 PyUnicodeObject *unicode;
3417 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003418 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003419 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003420 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003421 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003422 /* Offsets from q for retrieving byte pairs in the right order. */
3423#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3424 int ihi = 1, ilo = 0;
3425#else
3426 int ihi = 0, ilo = 1;
3427#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 PyObject *errorHandler = NULL;
3429 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430
3431 /* Note: size will always be longer than the resulting Unicode
3432 character count */
3433 unicode = _PyUnicode_New(size);
3434 if (!unicode)
3435 return NULL;
3436 if (size == 0)
3437 return (PyObject *)unicode;
3438
3439 /* Unpack UTF-16 encoded data */
3440 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003441 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003442 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443
3444 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003445 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003447 /* Check for BOM marks (U+FEFF) in the input and adjust current
3448 byte order setting accordingly. In native mode, the leading BOM
3449 mark is skipped, in all other modes, it is copied to the output
3450 stream as-is (giving a ZWNBSP character). */
3451 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003452 if (size >= 2) {
3453 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003454#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 if (bom == 0xFEFF) {
3456 q += 2;
3457 bo = -1;
3458 }
3459 else if (bom == 0xFFFE) {
3460 q += 2;
3461 bo = 1;
3462 }
Tim Petersced69f82003-09-16 20:30:58 +00003463#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 if (bom == 0xFEFF) {
3465 q += 2;
3466 bo = 1;
3467 }
3468 else if (bom == 0xFFFE) {
3469 q += 2;
3470 bo = -1;
3471 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003472#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003473 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475
Tim Peters772747b2001-08-09 22:21:55 +00003476 if (bo == -1) {
3477 /* force LE */
3478 ihi = 1;
3479 ilo = 0;
3480 }
3481 else if (bo == 1) {
3482 /* force BE */
3483 ihi = 0;
3484 ilo = 1;
3485 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003486#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3487 native_ordering = ilo < ihi;
3488#else
3489 native_ordering = ilo > ihi;
3490#endif
Tim Peters772747b2001-08-09 22:21:55 +00003491
Antoine Pitrouab868312009-01-10 15:40:25 +00003492 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003493 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003494 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003495 /* First check for possible aligned read of a C 'long'. Unaligned
3496 reads are more expensive, better to defer to another iteration. */
3497 if (!((size_t) q & LONG_PTR_MASK)) {
3498 /* Fast path for runs of non-surrogate chars. */
3499 register const unsigned char *_q = q;
3500 Py_UNICODE *_p = p;
3501 if (native_ordering) {
3502 /* Native ordering is simple: as long as the input cannot
3503 possibly contain a surrogate char, do an unrolled copy
3504 of several 16-bit code points to the target object.
3505 The non-surrogate check is done on several input bytes
3506 at a time (as many as a C 'long' can contain). */
3507 while (_q < aligned_end) {
3508 unsigned long data = * (unsigned long *) _q;
3509 if (data & FAST_CHAR_MASK)
3510 break;
3511 _p[0] = ((unsigned short *) _q)[0];
3512 _p[1] = ((unsigned short *) _q)[1];
3513#if (SIZEOF_LONG == 8)
3514 _p[2] = ((unsigned short *) _q)[2];
3515 _p[3] = ((unsigned short *) _q)[3];
3516#endif
3517 _q += SIZEOF_LONG;
3518 _p += SIZEOF_LONG / 2;
3519 }
3520 }
3521 else {
3522 /* Byteswapped ordering is similar, but we must decompose
3523 the copy bytewise, and take care of zero'ing out the
3524 upper bytes if the target object is in 32-bit units
3525 (that is, in UCS-4 builds). */
3526 while (_q < aligned_end) {
3527 unsigned long data = * (unsigned long *) _q;
3528 if (data & SWAPPED_FAST_CHAR_MASK)
3529 break;
3530 /* Zero upper bytes in UCS-4 builds */
3531#if (Py_UNICODE_SIZE > 2)
3532 _p[0] = 0;
3533 _p[1] = 0;
3534#if (SIZEOF_LONG == 8)
3535 _p[2] = 0;
3536 _p[3] = 0;
3537#endif
3538#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003539 /* Issue #4916; UCS-4 builds on big endian machines must
3540 fill the two last bytes of each 4-byte unit. */
3541#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3542# define OFF 2
3543#else
3544# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003545#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003546 ((unsigned char *) _p)[OFF + 1] = _q[0];
3547 ((unsigned char *) _p)[OFF + 0] = _q[1];
3548 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3549 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3550#if (SIZEOF_LONG == 8)
3551 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3552 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3553 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3554 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3555#endif
3556#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003557 _q += SIZEOF_LONG;
3558 _p += SIZEOF_LONG / 2;
3559 }
3560 }
3561 p = _p;
3562 q = _q;
3563 if (q >= e)
3564 break;
3565 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003566 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567
Benjamin Peterson14339b62009-01-31 16:36:08 +00003568 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003569
3570 if (ch < 0xD800 || ch > 0xDFFF) {
3571 *p++ = ch;
3572 continue;
3573 }
3574
3575 /* UTF-16 code pair: */
3576 if (q > e) {
3577 errmsg = "unexpected end of data";
3578 startinpos = (((const char *)q) - 2) - starts;
3579 endinpos = ((const char *)e) + 1 - starts;
3580 goto utf16Error;
3581 }
3582 if (0xD800 <= ch && ch <= 0xDBFF) {
3583 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3584 q += 2;
3585 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003586#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 *p++ = ch;
3588 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003589#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003591#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003592 continue;
3593 }
3594 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003595 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 startinpos = (((const char *)q)-4)-starts;
3597 endinpos = startinpos+2;
3598 goto utf16Error;
3599 }
3600
Benjamin Peterson14339b62009-01-31 16:36:08 +00003601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 errmsg = "illegal encoding";
3603 startinpos = (((const char *)q)-2)-starts;
3604 endinpos = startinpos+2;
3605 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003606
Benjamin Peterson29060642009-01-31 22:14:21 +00003607 utf16Error:
3608 outpos = p - PyUnicode_AS_UNICODE(unicode);
3609 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003610 errors,
3611 &errorHandler,
3612 "utf16", errmsg,
3613 &starts,
3614 (const char **)&e,
3615 &startinpos,
3616 &endinpos,
3617 &exc,
3618 (const char **)&q,
3619 &unicode,
3620 &outpos,
3621 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003624 /* remaining byte at the end? (size should be even) */
3625 if (e == q) {
3626 if (!consumed) {
3627 errmsg = "truncated data";
3628 startinpos = ((const char *)q) - starts;
3629 endinpos = ((const char *)e) + 1 - starts;
3630 outpos = p - PyUnicode_AS_UNICODE(unicode);
3631 if (unicode_decode_call_errorhandler(
3632 errors,
3633 &errorHandler,
3634 "utf16", errmsg,
3635 &starts,
3636 (const char **)&e,
3637 &startinpos,
3638 &endinpos,
3639 &exc,
3640 (const char **)&q,
3641 &unicode,
3642 &outpos,
3643 &p))
3644 goto onError;
3645 /* The remaining input chars are ignored if the callback
3646 chooses to skip the input */
3647 }
3648 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649
3650 if (byteorder)
3651 *byteorder = bo;
3652
Walter Dörwald69652032004-09-07 20:24:22 +00003653 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003654 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003657 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 goto onError;
3659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 Py_XDECREF(errorHandler);
3661 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 return (PyObject *)unicode;
3663
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 Py_XDECREF(errorHandler);
3667 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 return NULL;
3669}
3670
Antoine Pitrouab868312009-01-10 15:40:25 +00003671#undef FAST_CHAR_MASK
3672#undef SWAPPED_FAST_CHAR_MASK
3673
Tim Peters772747b2001-08-09 22:21:55 +00003674PyObject *
3675PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 Py_ssize_t size,
3677 const char *errors,
3678 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003680 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003681 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003682 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003683#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003684 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003685#else
3686 const int pairs = 0;
3687#endif
Tim Peters772747b2001-08-09 22:21:55 +00003688 /* Offsets from p for storing byte pairs in the right order. */
3689#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3690 int ihi = 1, ilo = 0;
3691#else
3692 int ihi = 0, ilo = 1;
3693#endif
3694
Benjamin Peterson29060642009-01-31 22:14:21 +00003695#define STORECHAR(CH) \
3696 do { \
3697 p[ihi] = ((CH) >> 8) & 0xff; \
3698 p[ilo] = (CH) & 0xff; \
3699 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003700 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003702#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003703 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 if (s[i] >= 0x10000)
3705 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003706#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003707 /* 2 * (size + pairs + (byteorder == 0)) */
3708 if (size > PY_SSIZE_T_MAX ||
3709 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003711 nsize = size + pairs + (byteorder == 0);
3712 bytesize = nsize * 2;
3713 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003715 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 if (v == NULL)
3717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003719 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003722 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003723 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003724
3725 if (byteorder == -1) {
3726 /* force LE */
3727 ihi = 1;
3728 ilo = 0;
3729 }
3730 else if (byteorder == 1) {
3731 /* force BE */
3732 ihi = 0;
3733 ilo = 1;
3734 }
3735
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003736 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003737 Py_UNICODE ch = *s++;
3738 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003739#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003740 if (ch >= 0x10000) {
3741 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3742 ch = 0xD800 | ((ch-0x10000) >> 10);
3743 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003744#endif
Tim Peters772747b2001-08-09 22:21:55 +00003745 STORECHAR(ch);
3746 if (ch2)
3747 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003748 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003749
3750 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753}
3754
Alexander Belopolsky40018472011-02-26 01:02:56 +00003755PyObject *
3756PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757{
3758 if (!PyUnicode_Check(unicode)) {
3759 PyErr_BadArgument();
3760 return NULL;
3761 }
3762 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003763 PyUnicode_GET_SIZE(unicode),
3764 NULL,
3765 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766}
3767
3768/* --- Unicode Escape Codec ----------------------------------------------- */
3769
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in lazily by the unicode-escape decoder
   to resolve \N{...} character names -- its users are outside this view,
   confirm before relying on that. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003771
Alexander Belopolsky40018472011-02-26 01:02:56 +00003772PyObject *
3773PyUnicode_DecodeUnicodeEscape(const char *s,
3774 Py_ssize_t size,
3775 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003778 Py_ssize_t startinpos;
3779 Py_ssize_t endinpos;
3780 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003785 char* message;
3786 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 PyObject *errorHandler = NULL;
3788 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003789
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 /* Escaped strings will always be longer than the resulting
3791 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 length after conversion to the true value.
3793 (but if the error callback returns a long replacement string
3794 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 v = _PyUnicode_New(size);
3796 if (v == NULL)
3797 goto onError;
3798 if (size == 0)
3799 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003800
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003801 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003803
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 while (s < end) {
3805 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003806 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808
3809 /* Non-escape characters are interpreted as Unicode ordinals */
3810 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003811 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 continue;
3813 }
3814
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 /* \ - Escapes */
3817 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003818 c = *s++;
3819 if (s > end)
3820 c = '\0'; /* Invalid after \ */
3821 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 case '\n': break;
3825 case '\\': *p++ = '\\'; break;
3826 case '\'': *p++ = '\''; break;
3827 case '\"': *p++ = '\"'; break;
3828 case 'b': *p++ = '\b'; break;
3829 case 'f': *p++ = '\014'; break; /* FF */
3830 case 't': *p++ = '\t'; break;
3831 case 'n': *p++ = '\n'; break;
3832 case 'r': *p++ = '\r'; break;
3833 case 'v': *p++ = '\013'; break; /* VT */
3834 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3835
Benjamin Peterson29060642009-01-31 22:14:21 +00003836 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 case '0': case '1': case '2': case '3':
3838 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003839 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003840 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003841 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003842 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003843 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003845 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 break;
3847
Benjamin Peterson29060642009-01-31 22:14:21 +00003848 /* hex escapes */
3849 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003851 digits = 2;
3852 message = "truncated \\xXX escape";
3853 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003857 digits = 4;
3858 message = "truncated \\uXXXX escape";
3859 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860
Benjamin Peterson29060642009-01-31 22:14:21 +00003861 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003862 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003863 digits = 8;
3864 message = "truncated \\UXXXXXXXX escape";
3865 hexescape:
3866 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 outpos = p-PyUnicode_AS_UNICODE(v);
3868 if (s+digits>end) {
3869 endinpos = size;
3870 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003871 errors, &errorHandler,
3872 "unicodeescape", "end of string in escape sequence",
3873 &starts, &end, &startinpos, &endinpos, &exc, &s,
3874 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 goto onError;
3876 goto nextByte;
3877 }
3878 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003879 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003880 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 endinpos = (s+i+1)-starts;
3882 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003883 errors, &errorHandler,
3884 "unicodeescape", message,
3885 &starts, &end, &startinpos, &endinpos, &exc, &s,
3886 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003887 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003889 }
3890 chr = (chr<<4) & ~0xF;
3891 if (c >= '0' && c <= '9')
3892 chr += c - '0';
3893 else if (c >= 'a' && c <= 'f')
3894 chr += 10 + c - 'a';
3895 else
3896 chr += 10 + c - 'A';
3897 }
3898 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003899 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003900 /* _decoding_error will have already written into the
3901 target buffer. */
3902 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003903 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003904 /* when we get here, chr is a 32-bit unicode character */
3905 if (chr <= 0xffff)
3906 /* UCS-2 character */
3907 *p++ = (Py_UNICODE) chr;
3908 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003909 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003910 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003911#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003912 *p++ = chr;
3913#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003914 chr -= 0x10000L;
3915 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003916 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003917#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003918 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919 endinpos = s-starts;
3920 outpos = p-PyUnicode_AS_UNICODE(v);
3921 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003922 errors, &errorHandler,
3923 "unicodeescape", "illegal Unicode character",
3924 &starts, &end, &startinpos, &endinpos, &exc, &s,
3925 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003926 goto onError;
3927 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003928 break;
3929
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003931 case 'N':
3932 message = "malformed \\N character escape";
3933 if (ucnhash_CAPI == NULL) {
3934 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003935 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003936 if (ucnhash_CAPI == NULL)
3937 goto ucnhashError;
3938 }
3939 if (*s == '{') {
3940 const char *start = s+1;
3941 /* look for the closing brace */
3942 while (*s != '}' && s < end)
3943 s++;
3944 if (s > start && s < end && *s == '}') {
3945 /* found a name. look it up in the unicode database */
3946 message = "unknown Unicode character name";
3947 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003948 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003949 goto store;
3950 }
3951 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 endinpos = s-starts;
3953 outpos = p-PyUnicode_AS_UNICODE(v);
3954 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003955 errors, &errorHandler,
3956 "unicodeescape", message,
3957 &starts, &end, &startinpos, &endinpos, &exc, &s,
3958 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003959 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003960 break;
3961
3962 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003963 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 message = "\\ at end of string";
3965 s--;
3966 endinpos = s-starts;
3967 outpos = p-PyUnicode_AS_UNICODE(v);
3968 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003969 errors, &errorHandler,
3970 "unicodeescape", message,
3971 &starts, &end, &startinpos, &endinpos, &exc, &s,
3972 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003973 goto onError;
3974 }
3975 else {
3976 *p++ = '\\';
3977 *p++ = (unsigned char)s[-1];
3978 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003979 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003984 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003986 Py_XDECREF(errorHandler);
3987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003989
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003991 PyErr_SetString(
3992 PyExc_UnicodeError,
3993 "\\N escapes not supported (can't load unicodedata module)"
3994 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003995 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996 Py_XDECREF(errorHandler);
3997 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003998 return NULL;
3999
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004002 Py_XDECREF(errorHandler);
4003 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 return NULL;
4005}
4006
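/* Return a Unicode-Escape encoded version of the Unicode data.

   NOTE(review): this comment used to say that the result is enclosed in
   u"" or u'' quotes when `quotes` is true.  PyUnicode_EncodeUnicodeEscape()
   below takes no `quotes` argument and never writes surrounding quotes.

*/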
4013
Thomas Wouters477c8d52006-05-27 19:21:47 +00004014Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 Py_ssize_t size,
4016 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00004017{
4018 /* like wcschr, but doesn't stop at NULL characters */
4019
4020 while (size-- > 0) {
4021 if (*s == ch)
4022 return s;
4023 s++;
4024 }
4025
4026 return NULL;
4027}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004028
Walter Dörwald79e913e2007-05-12 11:08:06 +00004029static const char *hexdigits = "0123456789abcdef";
4030
Alexander Belopolsky40018472011-02-26 01:02:56 +00004031PyObject *
4032PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4033 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004035 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004038#ifdef Py_UNICODE_WIDE
4039 const Py_ssize_t expandsize = 10;
4040#else
4041 const Py_ssize_t expandsize = 6;
4042#endif
4043
Thomas Wouters89f507f2006-12-13 04:49:30 +00004044 /* XXX(nnorwitz): rather than over-allocating, it would be
4045 better to choose a different scheme. Perhaps scan the
4046 first N-chars of the string and allocate based on that size.
4047 */
4048 /* Initial allocation is based on the longest-possible unichr
4049 escape.
4050
4051 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4052 unichr, so in this case it's the longest unichr escape. In
4053 narrow (UTF-16) builds this is five chars per source unichr
4054 since there are two unichrs in the surrogate pair, so in narrow
4055 (UTF-16) builds it's not the longest unichr escape.
4056
4057 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4058 so in the narrow (UTF-16) build case it's the longest unichr
4059 escape.
4060 */
4061
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004062 if (size == 0)
4063 return PyBytes_FromStringAndSize(NULL, 0);
4064
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004065 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004067
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004068 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 2
4070 + expandsize*size
4071 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 if (repr == NULL)
4073 return NULL;
4074
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004075 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 while (size-- > 0) {
4078 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004079
Walter Dörwald79e913e2007-05-12 11:08:06 +00004080 /* Escape backslashes */
4081 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 *p++ = '\\';
4083 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004084 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004085 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004086
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004087#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004088 /* Map 21-bit characters to '\U00xxxxxx' */
4089 else if (ch >= 0x10000) {
4090 *p++ = '\\';
4091 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004092 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4093 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4094 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4095 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4096 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4097 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4098 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4099 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004101 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004102#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4104 else if (ch >= 0xD800 && ch < 0xDC00) {
4105 Py_UNICODE ch2;
4106 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004107
Benjamin Peterson29060642009-01-31 22:14:21 +00004108 ch2 = *s++;
4109 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004110 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4112 *p++ = '\\';
4113 *p++ = 'U';
4114 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4115 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4116 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4117 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4118 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4119 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4120 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4121 *p++ = hexdigits[ucs & 0x0000000F];
4122 continue;
4123 }
4124 /* Fall through: isolated surrogates are copied as-is */
4125 s--;
4126 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004127 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004128#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004131 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 *p++ = '\\';
4133 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004134 *p++ = hexdigits[(ch >> 12) & 0x000F];
4135 *p++ = hexdigits[(ch >> 8) & 0x000F];
4136 *p++ = hexdigits[(ch >> 4) & 0x000F];
4137 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004139
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004140 /* Map special whitespace to '\t', \n', '\r' */
4141 else if (ch == '\t') {
4142 *p++ = '\\';
4143 *p++ = 't';
4144 }
4145 else if (ch == '\n') {
4146 *p++ = '\\';
4147 *p++ = 'n';
4148 }
4149 else if (ch == '\r') {
4150 *p++ = '\\';
4151 *p++ = 'r';
4152 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004153
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004154 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004155 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004157 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004158 *p++ = hexdigits[(ch >> 4) & 0x000F];
4159 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004160 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004161
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 /* Copy everything else as-is */
4163 else
4164 *p++ = (char) ch;
4165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004167 assert(p - PyBytes_AS_STRING(repr) > 0);
4168 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4169 return NULL;
4170 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171}
4172
Alexander Belopolsky40018472011-02-26 01:02:56 +00004173PyObject *
4174PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004176 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 if (!PyUnicode_Check(unicode)) {
4178 PyErr_BadArgument();
4179 return NULL;
4180 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004181 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4182 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004183 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184}
4185
4186/* --- Raw Unicode Escape Codec ------------------------------------------- */
4187
Alexander Belopolsky40018472011-02-26 01:02:56 +00004188PyObject *
4189PyUnicode_DecodeRawUnicodeEscape(const char *s,
4190 Py_ssize_t size,
4191 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004194 Py_ssize_t startinpos;
4195 Py_ssize_t endinpos;
4196 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 const char *end;
4200 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 PyObject *errorHandler = NULL;
4202 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004203
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 /* Escaped strings will always be longer than the resulting
4205 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 length after conversion to the true value. (But decoding error
4207 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 v = _PyUnicode_New(size);
4209 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 end = s + size;
4215 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 unsigned char c;
4217 Py_UCS4 x;
4218 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004219 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 /* Non-escape characters are interpreted as Unicode ordinals */
4222 if (*s != '\\') {
4223 *p++ = (unsigned char)*s++;
4224 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004225 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 startinpos = s-starts;
4227
4228 /* \u-escapes are only interpreted iff the number of leading
4229 backslashes if odd */
4230 bs = s;
4231 for (;s < end;) {
4232 if (*s != '\\')
4233 break;
4234 *p++ = (unsigned char)*s++;
4235 }
4236 if (((s - bs) & 1) == 0 ||
4237 s >= end ||
4238 (*s != 'u' && *s != 'U')) {
4239 continue;
4240 }
4241 p--;
4242 count = *s=='u' ? 4 : 8;
4243 s++;
4244
4245 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4246 outpos = p-PyUnicode_AS_UNICODE(v);
4247 for (x = 0, i = 0; i < count; ++i, ++s) {
4248 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004249 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 endinpos = s-starts;
4251 if (unicode_decode_call_errorhandler(
4252 errors, &errorHandler,
4253 "rawunicodeescape", "truncated \\uXXXX",
4254 &starts, &end, &startinpos, &endinpos, &exc, &s,
4255 &v, &outpos, &p))
4256 goto onError;
4257 goto nextByte;
4258 }
4259 x = (x<<4) & ~0xF;
4260 if (c >= '0' && c <= '9')
4261 x += c - '0';
4262 else if (c >= 'a' && c <= 'f')
4263 x += 10 + c - 'a';
4264 else
4265 x += 10 + c - 'A';
4266 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004267 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 /* UCS-2 character */
4269 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004270 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 /* UCS-4 character. Either store directly, or as
4272 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004273#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004275#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 x -= 0x10000L;
4277 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4278 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004279#endif
4280 } else {
4281 endinpos = s-starts;
4282 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004283 if (unicode_decode_call_errorhandler(
4284 errors, &errorHandler,
4285 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 &starts, &end, &startinpos, &endinpos, &exc, &s,
4287 &v, &outpos, &p))
4288 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 nextByte:
4291 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004293 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_XDECREF(errorHandler);
4296 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004298
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 Py_XDECREF(errorHandler);
4302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303 return NULL;
4304}
4305
Alexander Belopolsky40018472011-02-26 01:02:56 +00004306PyObject *
4307PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4308 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004310 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 char *p;
4312 char *q;
4313
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004314#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004315 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004316#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004317 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004318#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004319
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004320 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004321 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004322
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004323 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 if (repr == NULL)
4325 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004326 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004327 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004329 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 while (size-- > 0) {
4331 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004332#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004333 /* Map 32-bit characters to '\Uxxxxxxxx' */
4334 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004335 *p++ = '\\';
4336 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004337 *p++ = hexdigits[(ch >> 28) & 0xf];
4338 *p++ = hexdigits[(ch >> 24) & 0xf];
4339 *p++ = hexdigits[(ch >> 20) & 0xf];
4340 *p++ = hexdigits[(ch >> 16) & 0xf];
4341 *p++ = hexdigits[(ch >> 12) & 0xf];
4342 *p++ = hexdigits[(ch >> 8) & 0xf];
4343 *p++ = hexdigits[(ch >> 4) & 0xf];
4344 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004345 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004346 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004347#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4349 if (ch >= 0xD800 && ch < 0xDC00) {
4350 Py_UNICODE ch2;
4351 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004352
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 ch2 = *s++;
4354 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004355 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004356 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4357 *p++ = '\\';
4358 *p++ = 'U';
4359 *p++ = hexdigits[(ucs >> 28) & 0xf];
4360 *p++ = hexdigits[(ucs >> 24) & 0xf];
4361 *p++ = hexdigits[(ucs >> 20) & 0xf];
4362 *p++ = hexdigits[(ucs >> 16) & 0xf];
4363 *p++ = hexdigits[(ucs >> 12) & 0xf];
4364 *p++ = hexdigits[(ucs >> 8) & 0xf];
4365 *p++ = hexdigits[(ucs >> 4) & 0xf];
4366 *p++ = hexdigits[ucs & 0xf];
4367 continue;
4368 }
4369 /* Fall through: isolated surrogates are copied as-is */
4370 s--;
4371 size++;
4372 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004373#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004374 /* Map 16-bit characters to '\uxxxx' */
4375 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 *p++ = '\\';
4377 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004378 *p++ = hexdigits[(ch >> 12) & 0xf];
4379 *p++ = hexdigits[(ch >> 8) & 0xf];
4380 *p++ = hexdigits[(ch >> 4) & 0xf];
4381 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 /* Copy everything else as-is */
4384 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385 *p++ = (char) ch;
4386 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004387 size = p - q;
4388
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004389 assert(size > 0);
4390 if (_PyBytes_Resize(&repr, size) < 0)
4391 return NULL;
4392 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393}
4394
Alexander Belopolsky40018472011-02-26 01:02:56 +00004395PyObject *
4396PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004398 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004400 PyErr_BadArgument();
4401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004403 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4404 PyUnicode_GET_SIZE(unicode));
4405
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004406 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407}
4408
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004409/* --- Unicode Internal Codec ------------------------------------------- */
4410
Alexander Belopolsky40018472011-02-26 01:02:56 +00004411PyObject *
4412_PyUnicode_DecodeUnicodeInternal(const char *s,
4413 Py_ssize_t size,
4414 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004415{
4416 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004417 Py_ssize_t startinpos;
4418 Py_ssize_t endinpos;
4419 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004420 PyUnicodeObject *v;
4421 Py_UNICODE *p;
4422 const char *end;
4423 const char *reason;
4424 PyObject *errorHandler = NULL;
4425 PyObject *exc = NULL;
4426
Neal Norwitzd43069c2006-01-08 01:12:10 +00004427#ifdef Py_UNICODE_WIDE
4428 Py_UNICODE unimax = PyUnicode_GetMax();
4429#endif
4430
Thomas Wouters89f507f2006-12-13 04:49:30 +00004431 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004432 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4433 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004435 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004437 p = PyUnicode_AS_UNICODE(v);
4438 end = s + size;
4439
4440 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004441 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004442 /* We have to sanity check the raw data, otherwise doom looms for
4443 some malformed UCS-4 data. */
4444 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004445#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004446 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004447#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004448 end-s < Py_UNICODE_SIZE
4449 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004451 startinpos = s - starts;
4452 if (end-s < Py_UNICODE_SIZE) {
4453 endinpos = end-starts;
4454 reason = "truncated input";
4455 }
4456 else {
4457 endinpos = s - starts + Py_UNICODE_SIZE;
4458 reason = "illegal code point (> 0x10FFFF)";
4459 }
4460 outpos = p - PyUnicode_AS_UNICODE(v);
4461 if (unicode_decode_call_errorhandler(
4462 errors, &errorHandler,
4463 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004464 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004465 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004466 goto onError;
4467 }
4468 }
4469 else {
4470 p++;
4471 s += Py_UNICODE_SIZE;
4472 }
4473 }
4474
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004475 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004476 goto onError;
4477 Py_XDECREF(errorHandler);
4478 Py_XDECREF(exc);
4479 return (PyObject *)v;
4480
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004482 Py_XDECREF(v);
4483 Py_XDECREF(errorHandler);
4484 Py_XDECREF(exc);
4485 return NULL;
4486}
4487
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488/* --- Latin-1 Codec ------------------------------------------------------ */
4489
Alexander Belopolsky40018472011-02-26 01:02:56 +00004490PyObject *
4491PyUnicode_DecodeLatin1(const char *s,
4492 Py_ssize_t size,
4493 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494{
4495 PyUnicodeObject *v;
4496 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004497 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004498
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004500 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 Py_UNICODE r = *(unsigned char*)s;
4502 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004503 }
4504
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505 v = _PyUnicode_New(size);
4506 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004511 e = s + size;
4512 /* Unrolling the copy makes it much faster by reducing the looping
4513 overhead. This is similar to what many memcpy() implementations do. */
4514 unrolled_end = e - 4;
4515 while (s < unrolled_end) {
4516 p[0] = (unsigned char) s[0];
4517 p[1] = (unsigned char) s[1];
4518 p[2] = (unsigned char) s[2];
4519 p[3] = (unsigned char) s[3];
4520 s += 4;
4521 p += 4;
4522 }
4523 while (s < e)
4524 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004526
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 Py_XDECREF(v);
4529 return NULL;
4530}
4531
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004533static void
4534make_encode_exception(PyObject **exceptionObject,
4535 const char *encoding,
4536 const Py_UNICODE *unicode, Py_ssize_t size,
4537 Py_ssize_t startpos, Py_ssize_t endpos,
4538 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 *exceptionObject = PyUnicodeEncodeError_Create(
4542 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 }
4544 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4546 goto onError;
4547 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4548 goto onError;
4549 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4550 goto onError;
4551 return;
4552 onError:
4553 Py_DECREF(*exceptionObject);
4554 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 }
4556}
4557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004559static void
4560raise_encode_exception(PyObject **exceptionObject,
4561 const char *encoding,
4562 const Py_UNICODE *unicode, Py_ssize_t size,
4563 Py_ssize_t startpos, Py_ssize_t endpos,
4564 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565{
4566 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570}
4571
4572/* error handling callback helper:
4573 build arguments, call the callback and check the arguments,
4574 put the result into newpos and return the replacement string, which
4575 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004576static PyObject *
4577unicode_encode_call_errorhandler(const char *errors,
4578 PyObject **errorHandler,
4579 const char *encoding, const char *reason,
4580 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4581 Py_ssize_t startpos, Py_ssize_t endpos,
4582 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004584 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585
4586 PyObject *restuple;
4587 PyObject *resunicode;
4588
4589 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004592 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 }
4594
4595 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599
4600 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004603 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004605 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 Py_DECREF(restuple);
4607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004609 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 &resunicode, newpos)) {
4611 Py_DECREF(restuple);
4612 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004614 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4615 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4616 Py_DECREF(restuple);
4617 return NULL;
4618 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004620 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004621 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4623 Py_DECREF(restuple);
4624 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004625 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 Py_INCREF(resunicode);
4627 Py_DECREF(restuple);
4628 return resunicode;
4629}
4630
Alexander Belopolsky40018472011-02-26 01:02:56 +00004631static PyObject *
4632unicode_encode_ucs1(const Py_UNICODE *p,
4633 Py_ssize_t size,
4634 const char *errors,
4635 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636{
4637 /* output object */
4638 PyObject *res;
4639 /* pointers to the beginning and end+1 of input */
4640 const Py_UNICODE *startp = p;
4641 const Py_UNICODE *endp = p + size;
4642 /* pointer to the beginning of the unencodable characters */
4643 /* const Py_UNICODE *badp = NULL; */
4644 /* pointer into the output */
4645 char *str;
4646 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004647 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004648 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4649 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 PyObject *errorHandler = NULL;
4651 PyObject *exc = NULL;
4652 /* the following variable is used for caching string comparisons
4653 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4654 int known_errorHandler = -1;
4655
4656 /* allocate enough for a simple encoding without
4657 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004658 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004659 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004660 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004662 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004663 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 ressize = size;
4665
4666 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004667 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 /* can we encode this? */
4670 if (c<limit) {
4671 /* no overflow check, because we know that the space is enough */
4672 *str++ = (char)c;
4673 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004674 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 else {
4676 Py_ssize_t unicodepos = p-startp;
4677 Py_ssize_t requiredsize;
4678 PyObject *repunicode;
4679 Py_ssize_t repsize;
4680 Py_ssize_t newpos;
4681 Py_ssize_t respos;
4682 Py_UNICODE *uni2;
4683 /* startpos for collecting unencodable chars */
4684 const Py_UNICODE *collstart = p;
4685 const Py_UNICODE *collend = p;
4686 /* find all unecodable characters */
4687 while ((collend < endp) && ((*collend)>=limit))
4688 ++collend;
4689 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4690 if (known_errorHandler==-1) {
4691 if ((errors==NULL) || (!strcmp(errors, "strict")))
4692 known_errorHandler = 1;
4693 else if (!strcmp(errors, "replace"))
4694 known_errorHandler = 2;
4695 else if (!strcmp(errors, "ignore"))
4696 known_errorHandler = 3;
4697 else if (!strcmp(errors, "xmlcharrefreplace"))
4698 known_errorHandler = 4;
4699 else
4700 known_errorHandler = 0;
4701 }
4702 switch (known_errorHandler) {
4703 case 1: /* strict */
4704 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4705 goto onError;
4706 case 2: /* replace */
4707 while (collstart++<collend)
4708 *str++ = '?'; /* fall through */
4709 case 3: /* ignore */
4710 p = collend;
4711 break;
4712 case 4: /* xmlcharrefreplace */
4713 respos = str - PyBytes_AS_STRING(res);
4714 /* determine replacement size (temporarily (mis)uses p) */
4715 for (p = collstart, repsize = 0; p < collend; ++p) {
4716 if (*p<10)
4717 repsize += 2+1+1;
4718 else if (*p<100)
4719 repsize += 2+2+1;
4720 else if (*p<1000)
4721 repsize += 2+3+1;
4722 else if (*p<10000)
4723 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004724#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 else
4726 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004727#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004728 else if (*p<100000)
4729 repsize += 2+5+1;
4730 else if (*p<1000000)
4731 repsize += 2+6+1;
4732 else
4733 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004734#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004735 }
4736 requiredsize = respos+repsize+(endp-collend);
4737 if (requiredsize > ressize) {
4738 if (requiredsize<2*ressize)
4739 requiredsize = 2*ressize;
4740 if (_PyBytes_Resize(&res, requiredsize))
4741 goto onError;
4742 str = PyBytes_AS_STRING(res) + respos;
4743 ressize = requiredsize;
4744 }
4745 /* generate replacement (temporarily (mis)uses p) */
4746 for (p = collstart; p < collend; ++p) {
4747 str += sprintf(str, "&#%d;", (int)*p);
4748 }
4749 p = collend;
4750 break;
4751 default:
4752 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4753 encoding, reason, startp, size, &exc,
4754 collstart-startp, collend-startp, &newpos);
4755 if (repunicode == NULL)
4756 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004757 if (PyBytes_Check(repunicode)) {
4758 /* Directly copy bytes result to output. */
4759 repsize = PyBytes_Size(repunicode);
4760 if (repsize > 1) {
4761 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004762 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004763 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4764 Py_DECREF(repunicode);
4765 goto onError;
4766 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004767 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004768 ressize += repsize-1;
4769 }
4770 memcpy(str, PyBytes_AsString(repunicode), repsize);
4771 str += repsize;
4772 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004773 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004774 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004775 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 /* need more space? (at least enough for what we
4777 have+the replacement+the rest of the string, so
4778 we won't have to check space for encodable characters) */
4779 respos = str - PyBytes_AS_STRING(res);
4780 repsize = PyUnicode_GET_SIZE(repunicode);
4781 requiredsize = respos+repsize+(endp-collend);
4782 if (requiredsize > ressize) {
4783 if (requiredsize<2*ressize)
4784 requiredsize = 2*ressize;
4785 if (_PyBytes_Resize(&res, requiredsize)) {
4786 Py_DECREF(repunicode);
4787 goto onError;
4788 }
4789 str = PyBytes_AS_STRING(res) + respos;
4790 ressize = requiredsize;
4791 }
4792 /* check if there is anything unencodable in the replacement
4793 and copy it to the output */
4794 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4795 c = *uni2;
4796 if (c >= limit) {
4797 raise_encode_exception(&exc, encoding, startp, size,
4798 unicodepos, unicodepos+1, reason);
4799 Py_DECREF(repunicode);
4800 goto onError;
4801 }
4802 *str = (char)c;
4803 }
4804 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004805 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004806 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004807 }
4808 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004809 /* Resize if we allocated to much */
4810 size = str - PyBytes_AS_STRING(res);
4811 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004812 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004813 if (_PyBytes_Resize(&res, size) < 0)
4814 goto onError;
4815 }
4816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 Py_XDECREF(errorHandler);
4818 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004819 return res;
4820
4821 onError:
4822 Py_XDECREF(res);
4823 Py_XDECREF(errorHandler);
4824 Py_XDECREF(exc);
4825 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826}
4827
Alexander Belopolsky40018472011-02-26 01:02:56 +00004828PyObject *
4829PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4830 Py_ssize_t size,
4831 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834}
4835
Alexander Belopolsky40018472011-02-26 01:02:56 +00004836PyObject *
4837PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838{
4839 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 PyErr_BadArgument();
4841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 }
4843 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 PyUnicode_GET_SIZE(unicode),
4845 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846}
4847
4848/* --- 7-bit ASCII Codec -------------------------------------------------- */
4849
Alexander Belopolsky40018472011-02-26 01:02:56 +00004850PyObject *
4851PyUnicode_DecodeASCII(const char *s,
4852 Py_ssize_t size,
4853 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 PyUnicodeObject *v;
4857 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004858 Py_ssize_t startinpos;
4859 Py_ssize_t endinpos;
4860 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 const char *e;
4862 PyObject *errorHandler = NULL;
4863 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004864
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004866 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 Py_UNICODE r = *(unsigned char*)s;
4868 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004869 }
Tim Petersced69f82003-09-16 20:30:58 +00004870
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 v = _PyUnicode_New(size);
4872 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004873 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877 e = s + size;
4878 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004879 register unsigned char c = (unsigned char)*s;
4880 if (c < 128) {
4881 *p++ = c;
4882 ++s;
4883 }
4884 else {
4885 startinpos = s-starts;
4886 endinpos = startinpos + 1;
4887 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4888 if (unicode_decode_call_errorhandler(
4889 errors, &errorHandler,
4890 "ascii", "ordinal not in range(128)",
4891 &starts, &e, &startinpos, &endinpos, &exc, &s,
4892 &v, &outpos, &p))
4893 goto onError;
4894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004896 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4898 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899 Py_XDECREF(errorHandler);
4900 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004902
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 Py_XDECREF(errorHandler);
4906 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 return NULL;
4908}
4909
Alexander Belopolsky40018472011-02-26 01:02:56 +00004910PyObject *
4911PyUnicode_EncodeASCII(const Py_UNICODE *p,
4912 Py_ssize_t size,
4913 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916}
4917
Alexander Belopolsky40018472011-02-26 01:02:56 +00004918PyObject *
4919PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920{
4921 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 PyErr_BadArgument();
4923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 }
4925 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 PyUnicode_GET_SIZE(unicode),
4927 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928}
4929
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004930#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004931
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004932/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004933
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004934#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004935#define NEED_RETRY
4936#endif
4937
4938/* XXX This code is limited to "true" double-byte encodings, as
4939 a) it assumes an incomplete character consists of a single byte, and
4940 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004942
/* Tell whether the byte at s + offset should be treated as a DBCS lead
   byte.

   Returns 0 if IsDBCSLeadByte() rejects that byte.  Otherwise the
   position CharPrev() reports for the preceding character is examined,
   and 1 is returned when that position is the byte itself, when the byte
   found there is not a lead byte, or when it lies exactly two bytes
   back; 0 in the remaining case. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4954
4955/*
4956 * Decode MBCS string into unicode object. If 'final' is set, converts
4957 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4958 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004959static int
4960decode_mbcs(PyUnicodeObject **v,
4961 const char *s, /* MBCS string */
4962 int size, /* sizeof MBCS string */
4963 int final,
4964 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004965{
4966 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004967 Py_ssize_t n;
4968 DWORD usize;
4969 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004970
4971 assert(size >= 0);
4972
Victor Stinner554f3f02010-06-16 23:33:54 +00004973 /* check and handle 'errors' arg */
4974 if (errors==NULL || strcmp(errors, "strict")==0)
4975 flags = MB_ERR_INVALID_CHARS;
4976 else if (strcmp(errors, "ignore")==0)
4977 flags = 0;
4978 else {
4979 PyErr_Format(PyExc_ValueError,
4980 "mbcs encoding does not support errors='%s'",
4981 errors);
4982 return -1;
4983 }
4984
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004985 /* Skip trailing lead-byte unless 'final' is set */
4986 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004987 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004988
4989 /* First get the size of the result */
4990 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004991 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4992 if (usize==0)
4993 goto mbcs_decode_error;
4994 } else
4995 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004996
4997 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 /* Create unicode object */
4999 *v = _PyUnicode_New(usize);
5000 if (*v == NULL)
5001 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005002 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005003 }
5004 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 /* Extend unicode object */
5006 n = PyUnicode_GET_SIZE(*v);
5007 if (_PyUnicode_Resize(v, n + usize) < 0)
5008 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005009 }
5010
5011 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00005012 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005014 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5015 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005017 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00005019
5020mbcs_decode_error:
5021 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5022 we raise a UnicodeDecodeError - else it is a 'generic'
5023 windows error
5024 */
5025 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5026 /* Ideally, we should get reason from FormatMessage - this
5027 is the Windows 2000 English version of the message
5028 */
5029 PyObject *exc = NULL;
5030 const char *reason = "No mapping for the Unicode character exists "
5031 "in the target multi-byte code page.";
5032 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5033 if (exc != NULL) {
5034 PyCodec_StrictErrors(exc);
5035 Py_DECREF(exc);
5036 }
5037 } else {
5038 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5039 }
5040 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005041}
5042
Alexander Belopolsky40018472011-02-26 01:02:56 +00005043PyObject *
5044PyUnicode_DecodeMBCSStateful(const char *s,
5045 Py_ssize_t size,
5046 const char *errors,
5047 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005048{
5049 PyUnicodeObject *v = NULL;
5050 int done;
5051
5052 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005054
5055#ifdef NEED_RETRY
5056 retry:
5057 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005058 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005059 else
5060#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005061 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005062
5063 if (done < 0) {
5064 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005066 }
5067
5068 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005070
5071#ifdef NEED_RETRY
5072 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 s += done;
5074 size -= done;
5075 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005076 }
5077#endif
5078
5079 return (PyObject *)v;
5080}
5081
Alexander Belopolsky40018472011-02-26 01:02:56 +00005082PyObject *
5083PyUnicode_DecodeMBCS(const char *s,
5084 Py_ssize_t size,
5085 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005086{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005087 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5088}
5089
5090/*
5091 * Convert unicode into string object (MBCS).
5092 * Returns 0 if succeed, -1 otherwise.
5093 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005094static int
5095encode_mbcs(PyObject **repr,
5096 const Py_UNICODE *p, /* unicode */
5097 int size, /* size of unicode */
5098 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005099{
Victor Stinner554f3f02010-06-16 23:33:54 +00005100 BOOL usedDefaultChar = FALSE;
5101 BOOL *pusedDefaultChar;
5102 int mbcssize;
5103 Py_ssize_t n;
5104 PyObject *exc = NULL;
5105 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005106
5107 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005108
Victor Stinner554f3f02010-06-16 23:33:54 +00005109 /* check and handle 'errors' arg */
5110 if (errors==NULL || strcmp(errors, "strict")==0) {
5111 flags = WC_NO_BEST_FIT_CHARS;
5112 pusedDefaultChar = &usedDefaultChar;
5113 } else if (strcmp(errors, "replace")==0) {
5114 flags = 0;
5115 pusedDefaultChar = NULL;
5116 } else {
5117 PyErr_Format(PyExc_ValueError,
5118 "mbcs encoding does not support errors='%s'",
5119 errors);
5120 return -1;
5121 }
5122
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005123 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005124 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005125 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5126 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 if (mbcssize == 0) {
5128 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5129 return -1;
5130 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005131 /* If we used a default char, then we failed! */
5132 if (pusedDefaultChar && *pusedDefaultChar)
5133 goto mbcs_encode_error;
5134 } else {
5135 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005136 }
5137
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005138 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 /* Create string object */
5140 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5141 if (*repr == NULL)
5142 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005143 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005144 }
5145 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 /* Extend string object */
5147 n = PyBytes_Size(*repr);
5148 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5149 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005150 }
5151
5152 /* Do the conversion */
5153 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005155 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5156 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5158 return -1;
5159 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005160 if (pusedDefaultChar && *pusedDefaultChar)
5161 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005162 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005163 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005164
5165mbcs_encode_error:
5166 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5167 Py_XDECREF(exc);
5168 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005169}
5170
Alexander Belopolsky40018472011-02-26 01:02:56 +00005171PyObject *
5172PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5173 Py_ssize_t size,
5174 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005175{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005176 PyObject *repr = NULL;
5177 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005178
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005179#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005181 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005182 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005183 else
5184#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005185 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005186
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005187 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 Py_XDECREF(repr);
5189 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005190 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005191
5192#ifdef NEED_RETRY
5193 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 p += INT_MAX;
5195 size -= INT_MAX;
5196 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005197 }
5198#endif
5199
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005200 return repr;
5201}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005202
Alexander Belopolsky40018472011-02-26 01:02:56 +00005203PyObject *
5204PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005205{
5206 if (!PyUnicode_Check(unicode)) {
5207 PyErr_BadArgument();
5208 return NULL;
5209 }
5210 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 PyUnicode_GET_SIZE(unicode),
5212 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005213}
5214
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005215#undef NEED_RETRY
5216
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005217#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005218
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219/* --- Character Mapping Codec -------------------------------------------- */
5220
Alexander Belopolsky40018472011-02-26 01:02:56 +00005221PyObject *
5222PyUnicode_DecodeCharmap(const char *s,
5223 Py_ssize_t size,
5224 PyObject *mapping,
5225 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005227 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005228 Py_ssize_t startinpos;
5229 Py_ssize_t endinpos;
5230 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 PyUnicodeObject *v;
5233 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005234 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 PyObject *errorHandler = NULL;
5236 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005237 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005238 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005239
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 /* Default to Latin-1 */
5241 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243
5244 v = _PyUnicode_New(size);
5245 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005251 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 mapstring = PyUnicode_AS_UNICODE(mapping);
5253 maplen = PyUnicode_GET_SIZE(mapping);
5254 while (s < e) {
5255 unsigned char ch = *s;
5256 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 if (ch < maplen)
5259 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 if (x == 0xfffe) {
5262 /* undefined mapping */
5263 outpos = p-PyUnicode_AS_UNICODE(v);
5264 startinpos = s-starts;
5265 endinpos = startinpos+1;
5266 if (unicode_decode_call_errorhandler(
5267 errors, &errorHandler,
5268 "charmap", "character maps to <undefined>",
5269 &starts, &e, &startinpos, &endinpos, &exc, &s,
5270 &v, &outpos, &p)) {
5271 goto onError;
5272 }
5273 continue;
5274 }
5275 *p++ = x;
5276 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005277 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005278 }
5279 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005280 while (s < e) {
5281 unsigned char ch = *s;
5282 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005283
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5285 w = PyLong_FromLong((long)ch);
5286 if (w == NULL)
5287 goto onError;
5288 x = PyObject_GetItem(mapping, w);
5289 Py_DECREF(w);
5290 if (x == NULL) {
5291 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5292 /* No mapping found means: mapping is undefined. */
5293 PyErr_Clear();
5294 x = Py_None;
5295 Py_INCREF(x);
5296 } else
5297 goto onError;
5298 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005299
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 /* Apply mapping */
5301 if (PyLong_Check(x)) {
5302 long value = PyLong_AS_LONG(x);
5303 if (value < 0 || value > 65535) {
5304 PyErr_SetString(PyExc_TypeError,
5305 "character mapping must be in range(65536)");
5306 Py_DECREF(x);
5307 goto onError;
5308 }
5309 *p++ = (Py_UNICODE)value;
5310 }
5311 else if (x == Py_None) {
5312 /* undefined mapping */
5313 outpos = p-PyUnicode_AS_UNICODE(v);
5314 startinpos = s-starts;
5315 endinpos = startinpos+1;
5316 if (unicode_decode_call_errorhandler(
5317 errors, &errorHandler,
5318 "charmap", "character maps to <undefined>",
5319 &starts, &e, &startinpos, &endinpos, &exc, &s,
5320 &v, &outpos, &p)) {
5321 Py_DECREF(x);
5322 goto onError;
5323 }
5324 Py_DECREF(x);
5325 continue;
5326 }
5327 else if (PyUnicode_Check(x)) {
5328 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005329
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 if (targetsize == 1)
5331 /* 1-1 mapping */
5332 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005333
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 else if (targetsize > 1) {
5335 /* 1-n mapping */
5336 if (targetsize > extrachars) {
5337 /* resize first */
5338 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5339 Py_ssize_t needed = (targetsize - extrachars) + \
5340 (targetsize << 2);
5341 extrachars += needed;
5342 /* XXX overflow detection missing */
5343 if (_PyUnicode_Resize(&v,
5344 PyUnicode_GET_SIZE(v) + needed) < 0) {
5345 Py_DECREF(x);
5346 goto onError;
5347 }
5348 p = PyUnicode_AS_UNICODE(v) + oldpos;
5349 }
5350 Py_UNICODE_COPY(p,
5351 PyUnicode_AS_UNICODE(x),
5352 targetsize);
5353 p += targetsize;
5354 extrachars -= targetsize;
5355 }
5356 /* 1-0 mapping: skip the character */
5357 }
5358 else {
5359 /* wrong return value */
5360 PyErr_SetString(PyExc_TypeError,
5361 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005362 Py_DECREF(x);
5363 goto onError;
5364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 Py_DECREF(x);
5366 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 }
5369 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5371 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 Py_XDECREF(errorHandler);
5373 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005375
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 Py_XDECREF(errorHandler);
5378 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 Py_XDECREF(v);
5380 return NULL;
5381}
5382
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005383/* Charmap encoding: the lookup table */
5384
Alexander Belopolsky40018472011-02-26 01:02:56 +00005385struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 PyObject_HEAD
5387 unsigned char level1[32];
5388 int count2, count3;
5389 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005390};
5391
5392static PyObject*
5393encoding_map_size(PyObject *obj, PyObject* args)
5394{
5395 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005396 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005398}
5399
5400static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005401 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 PyDoc_STR("Return the size (in bytes) of this object") },
5403 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005404};
5405
5406static void
5407encoding_map_dealloc(PyObject* o)
5408{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005409 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005410}
5411
5412static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005413 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 "EncodingMap", /*tp_name*/
5415 sizeof(struct encoding_map), /*tp_basicsize*/
5416 0, /*tp_itemsize*/
5417 /* methods */
5418 encoding_map_dealloc, /*tp_dealloc*/
5419 0, /*tp_print*/
5420 0, /*tp_getattr*/
5421 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005422 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 0, /*tp_repr*/
5424 0, /*tp_as_number*/
5425 0, /*tp_as_sequence*/
5426 0, /*tp_as_mapping*/
5427 0, /*tp_hash*/
5428 0, /*tp_call*/
5429 0, /*tp_str*/
5430 0, /*tp_getattro*/
5431 0, /*tp_setattro*/
5432 0, /*tp_as_buffer*/
5433 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5434 0, /*tp_doc*/
5435 0, /*tp_traverse*/
5436 0, /*tp_clear*/
5437 0, /*tp_richcompare*/
5438 0, /*tp_weaklistoffset*/
5439 0, /*tp_iter*/
5440 0, /*tp_iternext*/
5441 encoding_map_methods, /*tp_methods*/
5442 0, /*tp_members*/
5443 0, /*tp_getset*/
5444 0, /*tp_base*/
5445 0, /*tp_dict*/
5446 0, /*tp_descr_get*/
5447 0, /*tp_descr_set*/
5448 0, /*tp_dictoffset*/
5449 0, /*tp_init*/
5450 0, /*tp_alloc*/
5451 0, /*tp_new*/
5452 0, /*tp_free*/
5453 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005454};
5455
5456PyObject*
5457PyUnicode_BuildEncodingMap(PyObject* string)
5458{
5459 Py_UNICODE *decode;
5460 PyObject *result;
5461 struct encoding_map *mresult;
5462 int i;
5463 int need_dict = 0;
5464 unsigned char level1[32];
5465 unsigned char level2[512];
5466 unsigned char *mlevel1, *mlevel2, *mlevel3;
5467 int count2 = 0, count3 = 0;
5468
5469 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5470 PyErr_BadArgument();
5471 return NULL;
5472 }
5473 decode = PyUnicode_AS_UNICODE(string);
5474 memset(level1, 0xFF, sizeof level1);
5475 memset(level2, 0xFF, sizeof level2);
5476
5477 /* If there isn't a one-to-one mapping of NULL to \0,
5478 or if there are non-BMP characters, we need to use
5479 a mapping dictionary. */
5480 if (decode[0] != 0)
5481 need_dict = 1;
5482 for (i = 1; i < 256; i++) {
5483 int l1, l2;
5484 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005485#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005486 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005487#endif
5488 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005489 need_dict = 1;
5490 break;
5491 }
5492 if (decode[i] == 0xFFFE)
5493 /* unmapped character */
5494 continue;
5495 l1 = decode[i] >> 11;
5496 l2 = decode[i] >> 7;
5497 if (level1[l1] == 0xFF)
5498 level1[l1] = count2++;
5499 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005500 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005501 }
5502
5503 if (count2 >= 0xFF || count3 >= 0xFF)
5504 need_dict = 1;
5505
5506 if (need_dict) {
5507 PyObject *result = PyDict_New();
5508 PyObject *key, *value;
5509 if (!result)
5510 return NULL;
5511 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005512 key = PyLong_FromLong(decode[i]);
5513 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005514 if (!key || !value)
5515 goto failed1;
5516 if (PyDict_SetItem(result, key, value) == -1)
5517 goto failed1;
5518 Py_DECREF(key);
5519 Py_DECREF(value);
5520 }
5521 return result;
5522 failed1:
5523 Py_XDECREF(key);
5524 Py_XDECREF(value);
5525 Py_DECREF(result);
5526 return NULL;
5527 }
5528
5529 /* Create a three-level trie */
5530 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5531 16*count2 + 128*count3 - 1);
5532 if (!result)
5533 return PyErr_NoMemory();
5534 PyObject_Init(result, &EncodingMapType);
5535 mresult = (struct encoding_map*)result;
5536 mresult->count2 = count2;
5537 mresult->count3 = count3;
5538 mlevel1 = mresult->level1;
5539 mlevel2 = mresult->level23;
5540 mlevel3 = mresult->level23 + 16*count2;
5541 memcpy(mlevel1, level1, 32);
5542 memset(mlevel2, 0xFF, 16*count2);
5543 memset(mlevel3, 0, 128*count3);
5544 count3 = 0;
5545 for (i = 1; i < 256; i++) {
5546 int o1, o2, o3, i2, i3;
5547 if (decode[i] == 0xFFFE)
5548 /* unmapped character */
5549 continue;
5550 o1 = decode[i]>>11;
5551 o2 = (decode[i]>>7) & 0xF;
5552 i2 = 16*mlevel1[o1] + o2;
5553 if (mlevel2[i2] == 0xFF)
5554 mlevel2[i2] = count3++;
5555 o3 = decode[i] & 0x7F;
5556 i3 = 128*mlevel2[i2] + o3;
5557 mlevel3[i3] = i;
5558 }
5559 return result;
5560}
5561
5562static int
5563encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5564{
5565 struct encoding_map *map = (struct encoding_map*)mapping;
5566 int l1 = c>>11;
5567 int l2 = (c>>7) & 0xF;
5568 int l3 = c & 0x7F;
5569 int i;
5570
5571#ifdef Py_UNICODE_WIDE
5572 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005574 }
5575#endif
5576 if (c == 0)
5577 return 0;
5578 /* level 1*/
5579 i = map->level1[l1];
5580 if (i == 0xFF) {
5581 return -1;
5582 }
5583 /* level 2*/
5584 i = map->level23[16*i+l2];
5585 if (i == 0xFF) {
5586 return -1;
5587 }
5588 /* level 3 */
5589 i = map->level23[16*map->count2 + 128*i + l3];
5590 if (i == 0) {
5591 return -1;
5592 }
5593 return i;
5594}
5595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596/* Lookup the character ch in the mapping. If the character
5597 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005598 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005599static PyObject *
5600charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601{
Christian Heimes217cfd12007-12-02 14:31:20 +00005602 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 PyObject *x;
5604
5605 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 x = PyObject_GetItem(mapping, w);
5608 Py_DECREF(w);
5609 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5611 /* No mapping found means: mapping is undefined. */
5612 PyErr_Clear();
5613 x = Py_None;
5614 Py_INCREF(x);
5615 return x;
5616 } else
5617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005619 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005621 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 long value = PyLong_AS_LONG(x);
5623 if (value < 0 || value > 255) {
5624 PyErr_SetString(PyExc_TypeError,
5625 "character mapping must be in range(256)");
5626 Py_DECREF(x);
5627 return NULL;
5628 }
5629 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005631 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 /* wrong return value */
5635 PyErr_Format(PyExc_TypeError,
5636 "character mapping must return integer, bytes or None, not %.400s",
5637 x->ob_type->tp_name);
5638 Py_DECREF(x);
5639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 }
5641}
5642
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005643static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005644charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005645{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005646 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5647 /* exponentially overallocate to minimize reallocations */
5648 if (requiredsize < 2*outsize)
5649 requiredsize = 2*outsize;
5650 if (_PyBytes_Resize(outobj, requiredsize))
5651 return -1;
5652 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005653}
5654
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005659 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005660 space is available. Return a new reference to the object that
5661 was put in the output buffer, or Py_None, if the mapping was undefined
5662 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005663 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005664static charmapencode_result
5665charmapencode_output(Py_UNICODE c, PyObject *mapping,
5666 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005668 PyObject *rep;
5669 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005670 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671
Christian Heimes90aa7642007-12-19 02:45:37 +00005672 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005673 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005675 if (res == -1)
5676 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 if (outsize<requiredsize)
5678 if (charmapencode_resize(outobj, outpos, requiredsize))
5679 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005680 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 outstart[(*outpos)++] = (char)res;
5682 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005683 }
5684
5685 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005688 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 Py_DECREF(rep);
5690 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005691 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 if (PyLong_Check(rep)) {
5693 Py_ssize_t requiredsize = *outpos+1;
5694 if (outsize<requiredsize)
5695 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5696 Py_DECREF(rep);
5697 return enc_EXCEPTION;
5698 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005699 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005701 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 else {
5703 const char *repchars = PyBytes_AS_STRING(rep);
5704 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5705 Py_ssize_t requiredsize = *outpos+repsize;
5706 if (outsize<requiredsize)
5707 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5708 Py_DECREF(rep);
5709 return enc_EXCEPTION;
5710 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005711 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 memcpy(outstart + *outpos, repchars, repsize);
5713 *outpos += repsize;
5714 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005716 Py_DECREF(rep);
5717 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718}
5719
5720/* handle an error in PyUnicode_EncodeCharmap
5721 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005722static int
5723charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005724 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005726 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005727 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728{
5729 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005730 Py_ssize_t repsize;
5731 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 Py_UNICODE *uni2;
5733 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005734 Py_ssize_t collstartpos = *inpos;
5735 Py_ssize_t collendpos = *inpos+1;
5736 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 char *encoding = "charmap";
5738 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005739 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 /* find all unencodable characters */
5742 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005743 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005744 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 int res = encoding_map_lookup(p[collendpos], mapping);
5746 if (res != -1)
5747 break;
5748 ++collendpos;
5749 continue;
5750 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005751
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 rep = charmapencode_lookup(p[collendpos], mapping);
5753 if (rep==NULL)
5754 return -1;
5755 else if (rep!=Py_None) {
5756 Py_DECREF(rep);
5757 break;
5758 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005759 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 }
5762 /* cache callback name lookup
5763 * (if not done yet, i.e. it's the first error) */
5764 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 if ((errors==NULL) || (!strcmp(errors, "strict")))
5766 *known_errorHandler = 1;
5767 else if (!strcmp(errors, "replace"))
5768 *known_errorHandler = 2;
5769 else if (!strcmp(errors, "ignore"))
5770 *known_errorHandler = 3;
5771 else if (!strcmp(errors, "xmlcharrefreplace"))
5772 *known_errorHandler = 4;
5773 else
5774 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 }
5776 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005777 case 1: /* strict */
5778 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5779 return -1;
5780 case 2: /* replace */
5781 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 x = charmapencode_output('?', mapping, res, respos);
5783 if (x==enc_EXCEPTION) {
5784 return -1;
5785 }
5786 else if (x==enc_FAILED) {
5787 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5788 return -1;
5789 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005790 }
5791 /* fall through */
5792 case 3: /* ignore */
5793 *inpos = collendpos;
5794 break;
5795 case 4: /* xmlcharrefreplace */
5796 /* generate replacement (temporarily (mis)uses p) */
5797 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 char buffer[2+29+1+1];
5799 char *cp;
5800 sprintf(buffer, "&#%d;", (int)p[collpos]);
5801 for (cp = buffer; *cp; ++cp) {
5802 x = charmapencode_output(*cp, mapping, res, respos);
5803 if (x==enc_EXCEPTION)
5804 return -1;
5805 else if (x==enc_FAILED) {
5806 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5807 return -1;
5808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005809 }
5810 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005811 *inpos = collendpos;
5812 break;
5813 default:
5814 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 encoding, reason, p, size, exceptionObject,
5816 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005817 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005819 if (PyBytes_Check(repunicode)) {
5820 /* Directly copy bytes result to output. */
5821 Py_ssize_t outsize = PyBytes_Size(*res);
5822 Py_ssize_t requiredsize;
5823 repsize = PyBytes_Size(repunicode);
5824 requiredsize = *respos + repsize;
5825 if (requiredsize > outsize)
5826 /* Make room for all additional bytes. */
5827 if (charmapencode_resize(res, respos, requiredsize)) {
5828 Py_DECREF(repunicode);
5829 return -1;
5830 }
5831 memcpy(PyBytes_AsString(*res) + *respos,
5832 PyBytes_AsString(repunicode), repsize);
5833 *respos += repsize;
5834 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005835 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005836 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005838 /* generate replacement */
5839 repsize = PyUnicode_GET_SIZE(repunicode);
5840 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 x = charmapencode_output(*uni2, mapping, res, respos);
5842 if (x==enc_EXCEPTION) {
5843 return -1;
5844 }
5845 else if (x==enc_FAILED) {
5846 Py_DECREF(repunicode);
5847 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5848 return -1;
5849 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005850 }
5851 *inpos = newpos;
5852 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 }
5854 return 0;
5855}
5856
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyObject *
5858PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5859 Py_ssize_t size,
5860 PyObject *mapping,
5861 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 /* output object */
5864 PyObject *res = NULL;
5865 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 PyObject *errorHandler = NULL;
5870 PyObject *exc = NULL;
5871 /* the following variable is used for caching string comparisons
5872 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5873 * 3=ignore, 4=xmlcharrefreplace */
5874 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
5876 /* Default to Latin-1 */
5877 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 /* allocate enough for a simple encoding without
5881 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005882 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 if (res == NULL)
5884 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005885 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005888 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 /* try to encode it */
5890 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5891 if (x==enc_EXCEPTION) /* error */
5892 goto onError;
5893 if (x==enc_FAILED) { /* unencodable character */
5894 if (charmap_encoding_error(p, size, &inpos, mapping,
5895 &exc,
5896 &known_errorHandler, &errorHandler, errors,
5897 &res, &respos)) {
5898 goto onError;
5899 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005900 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 else
5902 /* done with this character => adjust input position */
5903 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005907 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005908 if (_PyBytes_Resize(&res, respos) < 0)
5909 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005910
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 Py_XDECREF(exc);
5912 Py_XDECREF(errorHandler);
5913 return res;
5914
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 Py_XDECREF(res);
5917 Py_XDECREF(exc);
5918 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 return NULL;
5920}
5921
Alexander Belopolsky40018472011-02-26 01:02:56 +00005922PyObject *
5923PyUnicode_AsCharmapString(PyObject *unicode,
5924 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925{
5926 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 PyErr_BadArgument();
5928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 }
5930 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 PyUnicode_GET_SIZE(unicode),
5932 mapping,
5933 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934}
5935
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005937static void
5938make_translate_exception(PyObject **exceptionObject,
5939 const Py_UNICODE *unicode, Py_ssize_t size,
5940 Py_ssize_t startpos, Py_ssize_t endpos,
5941 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005943 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005944 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 }
5947 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5949 goto onError;
5950 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5951 goto onError;
5952 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5953 goto onError;
5954 return;
5955 onError:
5956 Py_DECREF(*exceptionObject);
5957 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 }
5959}
5960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005962static void
5963raise_translate_exception(PyObject **exceptionObject,
5964 const Py_UNICODE *unicode, Py_ssize_t size,
5965 Py_ssize_t startpos, Py_ssize_t endpos,
5966 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967{
5968 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972}
5973
5974/* error handling callback helper:
5975 build arguments, call the callback and check the arguments,
5976 put the result into newpos and return the replacement string, which
5977 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005978static PyObject *
5979unicode_translate_call_errorhandler(const char *errors,
5980 PyObject **errorHandler,
5981 const char *reason,
5982 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5983 Py_ssize_t startpos, Py_ssize_t endpos,
5984 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005985{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005986 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005988 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 PyObject *restuple;
5990 PyObject *resunicode;
5991
5992 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 }
5997
5998 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002
6003 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006005 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00006008 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 Py_DECREF(restuple);
6010 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 }
6012 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 &resunicode, &i_newpos)) {
6014 Py_DECREF(restuple);
6015 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006017 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006019 else
6020 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006021 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6023 Py_DECREF(restuple);
6024 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006025 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 Py_INCREF(resunicode);
6027 Py_DECREF(restuple);
6028 return resunicode;
6029}
6030
6031/* Lookup the character ch in the mapping and put the result in result,
6032 which must be decrefed by the caller.
6033 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006034static int
6035charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036{
Christian Heimes217cfd12007-12-02 14:31:20 +00006037 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038 PyObject *x;
6039
6040 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 x = PyObject_GetItem(mapping, w);
6043 Py_DECREF(w);
6044 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6046 /* No mapping found means: use 1:1 mapping. */
6047 PyErr_Clear();
6048 *result = NULL;
6049 return 0;
6050 } else
6051 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052 }
6053 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 *result = x;
6055 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006057 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 long value = PyLong_AS_LONG(x);
6059 long max = PyUnicode_GetMax();
6060 if (value < 0 || value > max) {
6061 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006062 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 Py_DECREF(x);
6064 return -1;
6065 }
6066 *result = x;
6067 return 0;
6068 }
6069 else if (PyUnicode_Check(x)) {
6070 *result = x;
6071 return 0;
6072 }
6073 else {
6074 /* wrong return value */
6075 PyErr_SetString(PyExc_TypeError,
6076 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006077 Py_DECREF(x);
6078 return -1;
6079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080}
6081/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 if not reallocate and adjust various state variables.
6083 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006084static int
6085charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006088 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006089 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 /* remember old output position */
6091 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6092 /* exponentially overallocate to minimize reallocations */
6093 if (requiredsize < 2 * oldsize)
6094 requiredsize = 2 * oldsize;
6095 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6096 return -1;
6097 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098 }
6099 return 0;
6100}
6101/* lookup the character, put the result in the output string and adjust
6102 various state variables. Return a new reference to the object that
6103 was put in the output buffer in *result, or Py_None, if the mapping was
6104 undefined (in which case no character was written).
6105 The called must decref result.
6106 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006107static int
6108charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6109 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6110 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111{
Walter Dörwald4894c302003-10-24 14:25:28 +00006112 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 /* not found => default to 1:1 mapping */
6116 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006117 }
6118 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006120 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 /* no overflow check, because we know that the space is enough */
6122 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 }
6124 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6126 if (repsize==1) {
6127 /* no overflow check, because we know that the space is enough */
6128 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6129 }
6130 else if (repsize!=0) {
6131 /* more than one character */
6132 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6133 (insize - (curinp-startinp)) +
6134 repsize - 1;
6135 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6136 return -1;
6137 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6138 *outp += repsize;
6139 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 }
6141 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 return 0;
6144}
6145
Alexander Belopolsky40018472011-02-26 01:02:56 +00006146PyObject *
6147PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6148 Py_ssize_t size,
6149 PyObject *mapping,
6150 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152 /* output object */
6153 PyObject *res = NULL;
6154 /* pointers to the beginning and end+1 of input */
6155 const Py_UNICODE *startp = p;
6156 const Py_UNICODE *endp = p + size;
6157 /* pointer into the output */
6158 Py_UNICODE *str;
6159 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006160 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161 char *reason = "character maps to <undefined>";
6162 PyObject *errorHandler = NULL;
6163 PyObject *exc = NULL;
6164 /* the following variable is used for caching string comparisons
6165 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6166 * 3=ignore, 4=xmlcharrefreplace */
6167 int known_errorHandler = -1;
6168
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 PyErr_BadArgument();
6171 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173
6174 /* allocate enough for a simple 1:1 translation without
6175 replacements, if we need more, we'll resize */
6176 res = PyUnicode_FromUnicode(NULL, size);
6177 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006183 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 /* try to encode it */
6185 PyObject *x = NULL;
6186 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6187 Py_XDECREF(x);
6188 goto onError;
6189 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006190 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 if (x!=Py_None) /* it worked => adjust input pointer */
6192 ++p;
6193 else { /* untranslatable character */
6194 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6195 Py_ssize_t repsize;
6196 Py_ssize_t newpos;
6197 Py_UNICODE *uni2;
6198 /* startpos for collecting untranslatable chars */
6199 const Py_UNICODE *collstart = p;
6200 const Py_UNICODE *collend = p+1;
6201 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* find all untranslatable characters */
6204 while (collend < endp) {
6205 if (charmaptranslate_lookup(*collend, mapping, &x))
6206 goto onError;
6207 Py_XDECREF(x);
6208 if (x!=Py_None)
6209 break;
6210 ++collend;
6211 }
6212 /* cache callback name lookup
6213 * (if not done yet, i.e. it's the first error) */
6214 if (known_errorHandler==-1) {
6215 if ((errors==NULL) || (!strcmp(errors, "strict")))
6216 known_errorHandler = 1;
6217 else if (!strcmp(errors, "replace"))
6218 known_errorHandler = 2;
6219 else if (!strcmp(errors, "ignore"))
6220 known_errorHandler = 3;
6221 else if (!strcmp(errors, "xmlcharrefreplace"))
6222 known_errorHandler = 4;
6223 else
6224 known_errorHandler = 0;
6225 }
6226 switch (known_errorHandler) {
6227 case 1: /* strict */
6228 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006229 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 case 2: /* replace */
6231 /* No need to check for space, this is a 1:1 replacement */
6232 for (coll = collstart; coll<collend; ++coll)
6233 *str++ = '?';
6234 /* fall through */
6235 case 3: /* ignore */
6236 p = collend;
6237 break;
6238 case 4: /* xmlcharrefreplace */
6239 /* generate replacement (temporarily (mis)uses p) */
6240 for (p = collstart; p < collend; ++p) {
6241 char buffer[2+29+1+1];
6242 char *cp;
6243 sprintf(buffer, "&#%d;", (int)*p);
6244 if (charmaptranslate_makespace(&res, &str,
6245 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6246 goto onError;
6247 for (cp = buffer; *cp; ++cp)
6248 *str++ = *cp;
6249 }
6250 p = collend;
6251 break;
6252 default:
6253 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6254 reason, startp, size, &exc,
6255 collstart-startp, collend-startp, &newpos);
6256 if (repunicode == NULL)
6257 goto onError;
6258 /* generate replacement */
6259 repsize = PyUnicode_GET_SIZE(repunicode);
6260 if (charmaptranslate_makespace(&res, &str,
6261 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6262 Py_DECREF(repunicode);
6263 goto onError;
6264 }
6265 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6266 *str++ = *uni2;
6267 p = startp + newpos;
6268 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006270 }
6271 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 /* Resize if we allocated to much */
6273 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006274 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 if (PyUnicode_Resize(&res, respos) < 0)
6276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006277 }
6278 Py_XDECREF(exc);
6279 Py_XDECREF(errorHandler);
6280 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 Py_XDECREF(res);
6284 Py_XDECREF(exc);
6285 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 return NULL;
6287}
6288
Alexander Belopolsky40018472011-02-26 01:02:56 +00006289PyObject *
6290PyUnicode_Translate(PyObject *str,
6291 PyObject *mapping,
6292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293{
6294 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006295
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 str = PyUnicode_FromObject(str);
6297 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 PyUnicode_GET_SIZE(str),
6301 mapping,
6302 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 Py_DECREF(str);
6304 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006305
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 Py_XDECREF(str);
6308 return NULL;
6309}
Tim Petersced69f82003-09-16 20:30:58 +00006310
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006311PyObject *
6312PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6313 Py_ssize_t length)
6314{
6315 PyObject *result;
6316 Py_UNICODE *p; /* write pointer into result */
6317 Py_ssize_t i;
6318 /* Copy to a new string */
6319 result = (PyObject *)_PyUnicode_New(length);
6320 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6321 if (result == NULL)
6322 return result;
6323 p = PyUnicode_AS_UNICODE(result);
6324 /* Iterate over code points */
6325 for (i = 0; i < length; i++) {
6326 Py_UNICODE ch =s[i];
6327 if (ch > 127) {
6328 int decimal = Py_UNICODE_TODECIMAL(ch);
6329 if (decimal >= 0)
6330 p[i] = '0' + decimal;
6331 }
6332 }
6333 return result;
6334}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006335/* --- Decimal Encoder ---------------------------------------------------- */
6336
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337int
6338PyUnicode_EncodeDecimal(Py_UNICODE *s,
6339 Py_ssize_t length,
6340 char *output,
6341 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006342{
6343 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344 PyObject *errorHandler = NULL;
6345 PyObject *exc = NULL;
6346 const char *encoding = "decimal";
6347 const char *reason = "invalid decimal Unicode string";
6348 /* the following variable is used for caching string comparisons
6349 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6350 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006351
6352 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 PyErr_BadArgument();
6354 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006355 }
6356
6357 p = s;
6358 end = s + length;
6359 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 register Py_UNICODE ch = *p;
6361 int decimal;
6362 PyObject *repunicode;
6363 Py_ssize_t repsize;
6364 Py_ssize_t newpos;
6365 Py_UNICODE *uni2;
6366 Py_UNICODE *collstart;
6367 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006368
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006370 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 ++p;
6372 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006373 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 decimal = Py_UNICODE_TODECIMAL(ch);
6375 if (decimal >= 0) {
6376 *output++ = '0' + decimal;
6377 ++p;
6378 continue;
6379 }
6380 if (0 < ch && ch < 256) {
6381 *output++ = (char)ch;
6382 ++p;
6383 continue;
6384 }
6385 /* All other characters are considered unencodable */
6386 collstart = p;
6387 collend = p+1;
6388 while (collend < end) {
6389 if ((0 < *collend && *collend < 256) ||
6390 !Py_UNICODE_ISSPACE(*collend) ||
6391 Py_UNICODE_TODECIMAL(*collend))
6392 break;
6393 }
6394 /* cache callback name lookup
6395 * (if not done yet, i.e. it's the first error) */
6396 if (known_errorHandler==-1) {
6397 if ((errors==NULL) || (!strcmp(errors, "strict")))
6398 known_errorHandler = 1;
6399 else if (!strcmp(errors, "replace"))
6400 known_errorHandler = 2;
6401 else if (!strcmp(errors, "ignore"))
6402 known_errorHandler = 3;
6403 else if (!strcmp(errors, "xmlcharrefreplace"))
6404 known_errorHandler = 4;
6405 else
6406 known_errorHandler = 0;
6407 }
6408 switch (known_errorHandler) {
6409 case 1: /* strict */
6410 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6411 goto onError;
6412 case 2: /* replace */
6413 for (p = collstart; p < collend; ++p)
6414 *output++ = '?';
6415 /* fall through */
6416 case 3: /* ignore */
6417 p = collend;
6418 break;
6419 case 4: /* xmlcharrefreplace */
6420 /* generate replacement (temporarily (mis)uses p) */
6421 for (p = collstart; p < collend; ++p)
6422 output += sprintf(output, "&#%d;", (int)*p);
6423 p = collend;
6424 break;
6425 default:
6426 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6427 encoding, reason, s, length, &exc,
6428 collstart-s, collend-s, &newpos);
6429 if (repunicode == NULL)
6430 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006431 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006432 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006433 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6434 Py_DECREF(repunicode);
6435 goto onError;
6436 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 /* generate replacement */
6438 repsize = PyUnicode_GET_SIZE(repunicode);
6439 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6440 Py_UNICODE ch = *uni2;
6441 if (Py_UNICODE_ISSPACE(ch))
6442 *output++ = ' ';
6443 else {
6444 decimal = Py_UNICODE_TODECIMAL(ch);
6445 if (decimal >= 0)
6446 *output++ = '0' + decimal;
6447 else if (0 < ch && ch < 256)
6448 *output++ = (char)ch;
6449 else {
6450 Py_DECREF(repunicode);
6451 raise_encode_exception(&exc, encoding,
6452 s, length, collstart-s, collend-s, reason);
6453 goto onError;
6454 }
6455 }
6456 }
6457 p = s + newpos;
6458 Py_DECREF(repunicode);
6459 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006460 }
6461 /* 0-terminate the output string */
6462 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 Py_XDECREF(exc);
6464 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006465 return 0;
6466
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468 Py_XDECREF(exc);
6469 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006470 return -1;
6471}
6472
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473/* --- Helpers ------------------------------------------------------------ */
6474
Eric Smith8c663262007-08-25 02:26:07 +00006475#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006476#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006477
Thomas Wouters477c8d52006-05-27 19:21:47 +00006478#include "stringlib/count.h"
6479#include "stringlib/find.h"
6480#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006481#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006482
Eric Smith5807c412008-05-11 21:00:57 +00006483#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006484#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006485#include "stringlib/localeutil.h"
6486
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len; a negative end or start is taken relative to len
   and then clamped to 0.  start is not clamped against len or end, so
   callers must cope with start > end afterwards.

   start and end must be modifiable lvalues; each argument may be
   evaluated more than once.  The body is wrapped in do { } while (0) so
   that the macro behaves as a single statement (safe inside an unbraced
   if/else) and requires a terminating semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006501
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502Py_ssize_t
6503PyUnicode_Count(PyObject *str,
6504 PyObject *substr,
6505 Py_ssize_t start,
6506 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006508 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006509 PyUnicodeObject* str_obj;
6510 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006511
Thomas Wouters477c8d52006-05-27 19:21:47 +00006512 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6513 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006515 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6516 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 Py_DECREF(str_obj);
6518 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 }
Tim Petersced69f82003-09-16 20:30:58 +00006520
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006521 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006522 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006523 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6524 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006525 );
6526
6527 Py_DECREF(sub_obj);
6528 Py_DECREF(str_obj);
6529
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 return result;
6531}
6532
Alexander Belopolsky40018472011-02-26 01:02:56 +00006533Py_ssize_t
6534PyUnicode_Find(PyObject *str,
6535 PyObject *sub,
6536 Py_ssize_t start,
6537 Py_ssize_t end,
6538 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006540 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006541
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006543 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006545 sub = PyUnicode_FromObject(sub);
6546 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 Py_DECREF(str);
6548 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 }
Tim Petersced69f82003-09-16 20:30:58 +00006550
Thomas Wouters477c8d52006-05-27 19:21:47 +00006551 if (direction > 0)
6552 result = stringlib_find_slice(
6553 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6554 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6555 start, end
6556 );
6557 else
6558 result = stringlib_rfind_slice(
6559 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6560 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6561 start, end
6562 );
6563
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006565 Py_DECREF(sub);
6566
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 return result;
6568}
6569
Alexander Belopolsky40018472011-02-26 01:02:56 +00006570static int
6571tailmatch(PyUnicodeObject *self,
6572 PyUnicodeObject *substring,
6573 Py_ssize_t start,
6574 Py_ssize_t end,
6575 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 if (substring->length == 0)
6578 return 1;
6579
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006580 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 end -= substring->length;
6582 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
6585 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 if (Py_UNICODE_MATCH(self, end, substring))
6587 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 } else {
6589 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 }
6592
6593 return 0;
6594}
6595
Alexander Belopolsky40018472011-02-26 01:02:56 +00006596Py_ssize_t
6597PyUnicode_Tailmatch(PyObject *str,
6598 PyObject *substr,
6599 Py_ssize_t start,
6600 Py_ssize_t end,
6601 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006603 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 str = PyUnicode_FromObject(str);
6606 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 substr = PyUnicode_FromObject(substr);
6609 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 Py_DECREF(str);
6611 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 }
Tim Petersced69f82003-09-16 20:30:58 +00006613
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 (PyUnicodeObject *)substr,
6616 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 Py_DECREF(str);
6618 Py_DECREF(substr);
6619 return result;
6620}
6621
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622/* Apply fixfct filter to the Unicode object self and return a
6623 reference to the modified object */
6624
Alexander Belopolsky40018472011-02-26 01:02:56 +00006625static PyObject *
6626fixup(PyUnicodeObject *self,
6627 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628{
6629
6630 PyUnicodeObject *u;
6631
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006632 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006635
6636 Py_UNICODE_COPY(u->str, self->str, self->length);
6637
Tim Peters7a29bd52001-09-12 03:03:31 +00006638 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 /* fixfct should return TRUE if it modified the buffer. If
6640 FALSE, return a reference to the original buffer instead
6641 (to save space, not time) */
6642 Py_INCREF(self);
6643 Py_DECREF(u);
6644 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 }
6646 return (PyObject*) u;
6647}
6648
Alexander Belopolsky40018472011-02-26 01:02:56 +00006649static int
6650fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006652 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 Py_UNICODE *s = self->str;
6654 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006655
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006658
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 ch = Py_UNICODE_TOUPPER(*s);
6660 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 *s = ch;
6663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 s++;
6665 }
6666
6667 return status;
6668}
6669
Alexander Belopolsky40018472011-02-26 01:02:56 +00006670static int
6671fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006673 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 Py_UNICODE *s = self->str;
6675 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006679
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 ch = Py_UNICODE_TOLOWER(*s);
6681 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 *s = ch;
6684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 s++;
6686 }
6687
6688 return status;
6689}
6690
Alexander Belopolsky40018472011-02-26 01:02:56 +00006691static int
6692fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 Py_UNICODE *s = self->str;
6696 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 while (len-- > 0) {
6699 if (Py_UNICODE_ISUPPER(*s)) {
6700 *s = Py_UNICODE_TOLOWER(*s);
6701 status = 1;
6702 } else if (Py_UNICODE_ISLOWER(*s)) {
6703 *s = Py_UNICODE_TOUPPER(*s);
6704 status = 1;
6705 }
6706 s++;
6707 }
6708
6709 return status;
6710}
6711
Alexander Belopolsky40018472011-02-26 01:02:56 +00006712static int
6713fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006715 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006716 Py_UNICODE *s = self->str;
6717 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006718
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006719 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006721 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 *s = Py_UNICODE_TOUPPER(*s);
6723 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006725 s++;
6726 while (--len > 0) {
6727 if (Py_UNICODE_ISUPPER(*s)) {
6728 *s = Py_UNICODE_TOLOWER(*s);
6729 status = 1;
6730 }
6731 s++;
6732 }
6733 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734}
6735
Alexander Belopolsky40018472011-02-26 01:02:56 +00006736static int
6737fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738{
6739 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6740 register Py_UNICODE *e;
6741 int previous_is_cased;
6742
6743 /* Shortcut for single character strings */
6744 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6746 if (*p != ch) {
6747 *p = ch;
6748 return 1;
6749 }
6750 else
6751 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 }
Tim Petersced69f82003-09-16 20:30:58 +00006753
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 e = p + PyUnicode_GET_SIZE(self);
6755 previous_is_cased = 0;
6756 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006758
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 if (previous_is_cased)
6760 *p = Py_UNICODE_TOLOWER(ch);
6761 else
6762 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006763
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 if (Py_UNICODE_ISLOWER(ch) ||
6765 Py_UNICODE_ISUPPER(ch) ||
6766 Py_UNICODE_ISTITLE(ch))
6767 previous_is_cased = 1;
6768 else
6769 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 }
6771 return 1;
6772}
6773
Tim Peters8ce9f162004-08-27 01:49:32 +00006774PyObject *
6775PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776{
Skip Montanaro6543b452004-09-16 03:28:13 +00006777 const Py_UNICODE blank = ' ';
6778 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006779 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006780 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006781 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6782 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006783 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6784 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006785 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006786 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787
Tim Peters05eba1f2004-08-27 21:32:02 +00006788 fseq = PySequence_Fast(seq, "");
6789 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006790 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006791 }
6792
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006793 /* NOTE: the following code can't call back into Python code,
6794 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006795 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006796
Tim Peters05eba1f2004-08-27 21:32:02 +00006797 seqlen = PySequence_Fast_GET_SIZE(fseq);
6798 /* If empty sequence, return u"". */
6799 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006800 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6801 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006802 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006803 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006804 /* If singleton sequence with an exact Unicode, return that. */
6805 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 item = items[0];
6807 if (PyUnicode_CheckExact(item)) {
6808 Py_INCREF(item);
6809 res = (PyUnicodeObject *)item;
6810 goto Done;
6811 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006812 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006813 else {
6814 /* Set up sep and seplen */
6815 if (separator == NULL) {
6816 sep = &blank;
6817 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006818 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006819 else {
6820 if (!PyUnicode_Check(separator)) {
6821 PyErr_Format(PyExc_TypeError,
6822 "separator: expected str instance,"
6823 " %.80s found",
6824 Py_TYPE(separator)->tp_name);
6825 goto onError;
6826 }
6827 sep = PyUnicode_AS_UNICODE(separator);
6828 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006829 }
6830 }
6831
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006832 /* There are at least two things to join, or else we have a subclass
6833 * of str in the sequence.
6834 * Do a pre-pass to figure out the total amount of space we'll
6835 * need (sz), and see whether all argument are strings.
6836 */
6837 sz = 0;
6838 for (i = 0; i < seqlen; i++) {
6839 const Py_ssize_t old_sz = sz;
6840 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 if (!PyUnicode_Check(item)) {
6842 PyErr_Format(PyExc_TypeError,
6843 "sequence item %zd: expected str instance,"
6844 " %.80s found",
6845 i, Py_TYPE(item)->tp_name);
6846 goto onError;
6847 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006848 sz += PyUnicode_GET_SIZE(item);
6849 if (i != 0)
6850 sz += seplen;
6851 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6852 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006854 goto onError;
6855 }
6856 }
Tim Petersced69f82003-09-16 20:30:58 +00006857
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006858 res = _PyUnicode_New(sz);
6859 if (res == NULL)
6860 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006861
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006862 /* Catenate everything. */
6863 res_p = PyUnicode_AS_UNICODE(res);
6864 for (i = 0; i < seqlen; ++i) {
6865 Py_ssize_t itemlen;
6866 item = items[i];
6867 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 /* Copy item, and maybe the separator. */
6869 if (i) {
6870 Py_UNICODE_COPY(res_p, sep, seplen);
6871 res_p += seplen;
6872 }
6873 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6874 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006875 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006876
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006878 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 return (PyObject *)res;
6880
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006882 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006883 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 return NULL;
6885}
6886
Alexander Belopolsky40018472011-02-26 01:02:56 +00006887static PyUnicodeObject *
6888pad(PyUnicodeObject *self,
6889 Py_ssize_t left,
6890 Py_ssize_t right,
6891 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892{
6893 PyUnicodeObject *u;
6894
6895 if (left < 0)
6896 left = 0;
6897 if (right < 0)
6898 right = 0;
6899
Tim Peters7a29bd52001-09-12 03:03:31 +00006900 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 Py_INCREF(self);
6902 return self;
6903 }
6904
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006905 if (left > PY_SSIZE_T_MAX - self->length ||
6906 right > PY_SSIZE_T_MAX - (left + self->length)) {
6907 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6908 return NULL;
6909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 u = _PyUnicode_New(left + self->length + right);
6911 if (u) {
6912 if (left)
6913 Py_UNICODE_FILL(u->str, fill, left);
6914 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6915 if (right)
6916 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6917 }
6918
6919 return u;
6920}
6921
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyObject *
6923PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
6927 string = PyUnicode_FromObject(string);
6928 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006931 list = stringlib_splitlines(
6932 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6933 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
6935 Py_DECREF(string);
6936 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937}
6938
Alexander Belopolsky40018472011-02-26 01:02:56 +00006939static PyObject *
6940split(PyUnicodeObject *self,
6941 PyUnicodeObject *substring,
6942 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006945 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006948 return stringlib_split_whitespace(
6949 (PyObject*) self, self->str, self->length, maxcount
6950 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006952 return stringlib_split(
6953 (PyObject*) self, self->str, self->length,
6954 substring->str, substring->length,
6955 maxcount
6956 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957}
6958
Alexander Belopolsky40018472011-02-26 01:02:56 +00006959static PyObject *
6960rsplit(PyUnicodeObject *self,
6961 PyUnicodeObject *substring,
6962 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006963{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006964 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006965 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006966
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006967 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006968 return stringlib_rsplit_whitespace(
6969 (PyObject*) self, self->str, self->length, maxcount
6970 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006971
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006972 return stringlib_rsplit(
6973 (PyObject*) self, self->str, self->length,
6974 substring->str, substring->length,
6975 maxcount
6976 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006977}
6978
Alexander Belopolsky40018472011-02-26 01:02:56 +00006979static PyObject *
6980replace(PyUnicodeObject *self,
6981 PyUnicodeObject *str1,
6982 PyUnicodeObject *str2,
6983 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984{
6985 PyUnicodeObject *u;
6986
6987 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006989 else if (maxcount == 0 || self->length == 0)
6990 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
Thomas Wouters477c8d52006-05-27 19:21:47 +00006992 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006993 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006994 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006995 if (str1->length == 0)
6996 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006997 if (str1->length == 1) {
6998 /* replace characters */
6999 Py_UNICODE u1, u2;
7000 if (!findchar(self->str, self->length, str1->str[0]))
7001 goto nothing;
7002 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7003 if (!u)
7004 return NULL;
7005 Py_UNICODE_COPY(u->str, self->str, self->length);
7006 u1 = str1->str[0];
7007 u2 = str2->str[0];
7008 for (i = 0; i < u->length; i++)
7009 if (u->str[i] == u1) {
7010 if (--maxcount < 0)
7011 break;
7012 u->str[i] = u2;
7013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007015 i = stringlib_find(
7016 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007018 if (i < 0)
7019 goto nothing;
7020 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7021 if (!u)
7022 return NULL;
7023 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007024
7025 /* change everything in-place, starting with this one */
7026 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7027 i += str1->length;
7028
7029 while ( --maxcount > 0) {
7030 i = stringlib_find(self->str+i, self->length-i,
7031 str1->str, str1->length,
7032 i);
7033 if (i == -1)
7034 break;
7035 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7036 i += str1->length;
7037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007040
Brett Cannonb94767f2011-02-22 20:15:44 +00007041 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007042 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 Py_UNICODE *p;
7044
7045 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007046 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7047 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007048 if (n == 0)
7049 goto nothing;
7050 /* new_size = self->length + n * (str2->length - str1->length)); */
7051 delta = (str2->length - str1->length);
7052 if (delta == 0) {
7053 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007055 product = n * (str2->length - str1->length);
7056 if ((product / (str2->length - str1->length)) != n) {
7057 PyErr_SetString(PyExc_OverflowError,
7058 "replace string is too long");
7059 return NULL;
7060 }
7061 new_size = self->length + product;
7062 if (new_size < 0) {
7063 PyErr_SetString(PyExc_OverflowError,
7064 "replace string is too long");
7065 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 }
7067 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007068 u = _PyUnicode_New(new_size);
7069 if (!u)
7070 return NULL;
7071 i = 0;
7072 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007073 if (str1->length > 0) {
7074 while (n-- > 0) {
7075 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007076 j = stringlib_find(self->str+i, self->length-i,
7077 str1->str, str1->length,
7078 i);
7079 if (j == -1)
7080 break;
7081 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007082 /* copy unchanged part [i:j] */
7083 Py_UNICODE_COPY(p, self->str+i, j-i);
7084 p += j - i;
7085 }
7086 /* copy substitution string */
7087 if (str2->length > 0) {
7088 Py_UNICODE_COPY(p, str2->str, str2->length);
7089 p += str2->length;
7090 }
7091 i = j + str1->length;
7092 }
7093 if (i < self->length)
7094 /* copy tail [i:] */
7095 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7096 } else {
7097 /* interleave */
7098 while (n > 0) {
7099 Py_UNICODE_COPY(p, str2->str, str2->length);
7100 p += str2->length;
7101 if (--n <= 0)
7102 break;
7103 *p++ = self->str[i++];
7104 }
7105 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007109
Benjamin Peterson29060642009-01-31 22:14:21 +00007110 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007111 /* nothing to replace; return original string (when possible) */
7112 if (PyUnicode_CheckExact(self)) {
7113 Py_INCREF(self);
7114 return (PyObject *) self;
7115 }
7116 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117}
7118
7119/* --- Unicode Object Methods --------------------------------------------- */
7120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007121PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123\n\
7124Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007125characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126
7127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007128unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 return fixup(self, fixtitle);
7131}
7132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007133PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135\n\
7136Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007137have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
7139static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007140unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 return fixup(self, fixcapitalize);
7143}
7144
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self on whitespace, capitalize each word through
   fixup()/fixcapitalize, and join the words again with the default
   separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
7182
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007183/* Argument converter. Coerces to a single unicode character */
7184
7185static int
7186convert_uc(PyObject *obj, void *addr)
7187{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007188 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7189 PyObject *uniobj;
7190 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007191
Benjamin Peterson14339b62009-01-31 16:36:08 +00007192 uniobj = PyUnicode_FromObject(obj);
7193 if (uniobj == NULL) {
7194 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007196 return 0;
7197 }
7198 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7199 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007201 Py_DECREF(uniobj);
7202 return 0;
7203 }
7204 unistr = PyUnicode_AS_UNICODE(uniobj);
7205 *fillcharloc = unistr[0];
7206 Py_DECREF(uniobj);
7207 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007208}
7209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007213Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007214done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
7216static PyObject *
7217unicode_center(PyUnicodeObject *self, PyObject *args)
7218{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007219 Py_ssize_t marg, left;
7220 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007221 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222
Thomas Woutersde017742006-02-16 19:34:37 +00007223 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 return NULL;
7225
Tim Peters7a29bd52001-09-12 03:03:31 +00007226 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 Py_INCREF(self);
7228 return (PyObject*) self;
7229 }
7230
7231 marg = width - self->length;
7232 left = marg / 2 + (marg & width & 1);
7233
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007234 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235}
7236
Marc-André Lemburge5034372000-08-08 08:04:29 +00007237#if 0
7238
7239/* This code should go into some future Unicode collation support
7240 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007241 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007242
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007243/* speedy UTF-16 code point order comparison */
7244/* gleaned from: */
7245/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7246
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007247static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007248{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007249 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007250 0, 0, 0, 0, 0, 0, 0, 0,
7251 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007252 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007253};
7254
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255static int
7256unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7257{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007258 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007259
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 Py_UNICODE *s1 = str1->str;
7261 Py_UNICODE *s2 = str2->str;
7262
7263 len1 = str1->length;
7264 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007265
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007267 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007268
7269 c1 = *s1++;
7270 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007271
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 if (c1 > (1<<11) * 26)
7273 c1 += utf16Fixup[c1>>11];
7274 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007275 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007276 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007277
7278 if (c1 != c2)
7279 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007280
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007281 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 }
7283
7284 return (len1 < len2) ? -1 : (len1 != len2);
7285}
7286
Marc-André Lemburge5034372000-08-08 08:04:29 +00007287#else
7288
7289static int
7290unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7291{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007292 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007293
7294 Py_UNICODE *s1 = str1->str;
7295 Py_UNICODE *s2 = str2->str;
7296
7297 len1 = str1->length;
7298 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007299
Marc-André Lemburge5034372000-08-08 08:04:29 +00007300 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007301 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007302
Fredrik Lundh45714e92001-06-26 16:39:36 +00007303 c1 = *s1++;
7304 c2 = *s2++;
7305
7306 if (c1 != c2)
7307 return (c1 < c2) ? -1 : 1;
7308
Marc-André Lemburge5034372000-08-08 08:04:29 +00007309 len1--; len2--;
7310 }
7311
7312 return (len1 < len2) ? -1 : (len1 != len2);
7313}
7314
7315#endif
7316
Alexander Belopolsky40018472011-02-26 01:02:56 +00007317int
7318PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007320 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7321 return unicode_compare((PyUnicodeObject *)left,
7322 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007323 PyErr_Format(PyExc_TypeError,
7324 "Can't compare %.100s and %.100s",
7325 left->ob_type->tp_name,
7326 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327 return -1;
7328}
7329
Martin v. Löwis5b222132007-06-10 09:51:05 +00007330int
7331PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7332{
7333 int i;
7334 Py_UNICODE *id;
7335 assert(PyUnicode_Check(uni));
7336 id = PyUnicode_AS_UNICODE(uni);
7337 /* Compare Unicode string and source character set string */
7338 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 if (id[i] != str[i])
7340 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007341 /* This check keeps Python strings that end in '\0' from comparing equal
7342 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007343 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007345 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007347 return 0;
7348}
7349
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007350
Benjamin Peterson29060642009-01-31 22:14:21 +00007351#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007352 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007353
Alexander Belopolsky40018472011-02-26 01:02:56 +00007354PyObject *
7355PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007356{
7357 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007358
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007359 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7360 PyObject *v;
7361 if (((PyUnicodeObject *) left)->length !=
7362 ((PyUnicodeObject *) right)->length) {
7363 if (op == Py_EQ) {
7364 Py_INCREF(Py_False);
7365 return Py_False;
7366 }
7367 if (op == Py_NE) {
7368 Py_INCREF(Py_True);
7369 return Py_True;
7370 }
7371 }
7372 if (left == right)
7373 result = 0;
7374 else
7375 result = unicode_compare((PyUnicodeObject *)left,
7376 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007377
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007378 /* Convert the return value to a Boolean */
7379 switch (op) {
7380 case Py_EQ:
7381 v = TEST_COND(result == 0);
7382 break;
7383 case Py_NE:
7384 v = TEST_COND(result != 0);
7385 break;
7386 case Py_LE:
7387 v = TEST_COND(result <= 0);
7388 break;
7389 case Py_GE:
7390 v = TEST_COND(result >= 0);
7391 break;
7392 case Py_LT:
7393 v = TEST_COND(result == -1);
7394 break;
7395 case Py_GT:
7396 v = TEST_COND(result == 1);
7397 break;
7398 default:
7399 PyErr_BadArgument();
7400 return NULL;
7401 }
7402 Py_INCREF(v);
7403 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007404 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007405
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007406 Py_INCREF(Py_NotImplemented);
7407 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007408}
7409
Alexander Belopolsky40018472011-02-26 01:02:56 +00007410int
7411PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007412{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007413 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007414 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007415
7416 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007417 sub = PyUnicode_FromObject(element);
7418 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 PyErr_Format(PyExc_TypeError,
7420 "'in <string>' requires string as left operand, not %s",
7421 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007422 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007423 }
7424
Thomas Wouters477c8d52006-05-27 19:21:47 +00007425 str = PyUnicode_FromObject(container);
7426 if (!str) {
7427 Py_DECREF(sub);
7428 return -1;
7429 }
7430
7431 result = stringlib_contains_obj(str, sub);
7432
7433 Py_DECREF(str);
7434 Py_DECREF(sub);
7435
Guido van Rossum403d68b2000-03-13 15:55:09 +00007436 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007437}
7438
Guido van Rossumd57fd912000-03-10 22:53:23 +00007439/* Concat to string or Unicode object giving a new Unicode object. */
7440
Alexander Belopolsky40018472011-02-26 01:02:56 +00007441PyObject *
7442PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443{
7444 PyUnicodeObject *u = NULL, *v = NULL, *w;
7445
7446 /* Coerce the two arguments */
7447 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7448 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007449 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7451 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
7454 /* Shortcuts */
7455 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 Py_DECREF(v);
7457 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007458 }
7459 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 Py_DECREF(u);
7461 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007462 }
7463
7464 /* Concat the two Unicode strings */
7465 w = _PyUnicode_New(u->length + v->length);
7466 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007468 Py_UNICODE_COPY(w->str, u->str, u->length);
7469 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7470
7471 Py_DECREF(u);
7472 Py_DECREF(v);
7473 return (PyObject *)w;
7474
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 Py_XDECREF(u);
7477 Py_XDECREF(v);
7478 return NULL;
7479}
7480
Walter Dörwald1ab83302007-05-18 17:15:44 +00007481void
7482PyUnicode_Append(PyObject **pleft, PyObject *right)
7483{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007484 PyObject *new;
7485 if (*pleft == NULL)
7486 return;
7487 if (right == NULL || !PyUnicode_Check(*pleft)) {
7488 Py_DECREF(*pleft);
7489 *pleft = NULL;
7490 return;
7491 }
7492 new = PyUnicode_Concat(*pleft, right);
7493 Py_DECREF(*pleft);
7494 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007495}
7496
7497void
7498PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7499{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007500 PyUnicode_Append(pleft, right);
7501 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007502}
7503
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007504PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007506\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007507Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007508string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007509interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007510
7511static PyObject *
7512unicode_count(PyUnicodeObject *self, PyObject *args)
7513{
7514 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007515 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007516 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007517 PyObject *result;
7518
Guido van Rossumb8872e62000-05-09 14:14:27 +00007519 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007521 return NULL;
7522
7523 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007524 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007527
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007528 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007529 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007530 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007531 substring->str, substring->length,
7532 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007533 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007534
7535 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007536
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537 return result;
7538}
7539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007540PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007541 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007542\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007543Encode S using the codec registered for encoding. Default encoding\n\
7544is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007545handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007546a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7547'xmlcharrefreplace' as well as any other name registered with\n\
7548codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549
7550static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007551unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007552{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007553 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 char *encoding = NULL;
7555 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007556
Benjamin Peterson308d6372009-09-18 21:42:35 +00007557 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7558 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007560 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007561}
7562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007563PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565\n\
7566Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007567If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568
7569static PyObject*
7570unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7571{
7572 Py_UNICODE *e;
7573 Py_UNICODE *p;
7574 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007575 Py_UNICODE *qe;
7576 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577 PyUnicodeObject *u;
7578 int tabsize = 8;
7579
7580 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582
Thomas Wouters7e474022000-07-16 12:04:32 +00007583 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007584 i = 0; /* chars up to and including most recent \n or \r */
7585 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7586 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587 for (p = self->str; p < e; p++)
7588 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 if (tabsize > 0) {
7590 incr = tabsize - (j % tabsize); /* cannot overflow */
7591 if (j > PY_SSIZE_T_MAX - incr)
7592 goto overflow1;
7593 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007594 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 if (j > PY_SSIZE_T_MAX - 1)
7598 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 j++;
7600 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007601 if (i > PY_SSIZE_T_MAX - j)
7602 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007604 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 }
7606 }
7607
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007608 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007609 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007610
Guido van Rossumd57fd912000-03-10 22:53:23 +00007611 /* Second pass: create output string and fill it */
7612 u = _PyUnicode_New(i + j);
7613 if (!u)
7614 return NULL;
7615
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007616 j = 0; /* same as in first pass */
7617 q = u->str; /* next output char */
7618 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
7620 for (p = self->str; p < e; p++)
7621 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 if (tabsize > 0) {
7623 i = tabsize - (j % tabsize);
7624 j += i;
7625 while (i--) {
7626 if (q >= qe)
7627 goto overflow2;
7628 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007629 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007630 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007631 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 else {
7633 if (q >= qe)
7634 goto overflow2;
7635 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007636 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007637 if (*p == '\n' || *p == '\r')
7638 j = 0;
7639 }
7640
7641 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007642
7643 overflow2:
7644 Py_DECREF(u);
7645 overflow1:
7646 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7647 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007648}
7649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007650PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652\n\
7653Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007654such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007655arguments start and end are interpreted as in slice notation.\n\
7656\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007657Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658
7659static PyObject *
7660unicode_find(PyUnicodeObject *self, PyObject *args)
7661{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007662 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007663 Py_ssize_t start;
7664 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007665 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666
Christian Heimes9cd17752007-11-18 19:35:23 +00007667 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
Thomas Wouters477c8d52006-05-27 19:21:47 +00007670 result = stringlib_find_slice(
7671 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7672 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7673 start, end
7674 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
7676 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007677
Christian Heimes217cfd12007-12-02 14:31:20 +00007678 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679}
7680
7681static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007682unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683{
7684 if (index < 0 || index >= self->length) {
7685 PyErr_SetString(PyExc_IndexError, "string index out of range");
7686 return NULL;
7687 }
7688
7689 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7690}
7691
Guido van Rossumc2504932007-09-18 19:42:40 +00007692/* Believe it or not, this produces the same value for ASCII strings
7693 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007694static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007695unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007696{
Guido van Rossumc2504932007-09-18 19:42:40 +00007697 Py_ssize_t len;
7698 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007699 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007700
7701 if (self->hash != -1)
7702 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007703 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007704 p = self->str;
7705 x = *p << 7;
7706 while (--len >= 0)
7707 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007708 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007709 if (x == -1)
7710 x = -2;
7711 self->hash = x;
7712 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713}
7714
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007715PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007716 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007718Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719
7720static PyObject *
7721unicode_index(PyUnicodeObject *self, PyObject *args)
7722{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007723 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007724 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007725 Py_ssize_t start;
7726 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727
Christian Heimes9cd17752007-11-18 19:35:23 +00007728 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730
Thomas Wouters477c8d52006-05-27 19:21:47 +00007731 result = stringlib_find_slice(
7732 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7733 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7734 start, end
7735 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736
7737 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007738
Guido van Rossumd57fd912000-03-10 22:53:23 +00007739 if (result < 0) {
7740 PyErr_SetString(PyExc_ValueError, "substring not found");
7741 return NULL;
7742 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007743
Christian Heimes217cfd12007-12-02 14:31:20 +00007744 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745}
7746
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007747PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007750Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007751at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752
7753static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007754unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755{
7756 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7757 register const Py_UNICODE *e;
7758 int cased;
7759
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760 /* Shortcut for single character strings */
7761 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007764 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007765 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007767
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 e = p + PyUnicode_GET_SIZE(self);
7769 cased = 0;
7770 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007772
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7774 return PyBool_FromLong(0);
7775 else if (!cased && Py_UNICODE_ISLOWER(ch))
7776 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007778 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779}
7780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007781PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007783\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007784Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007785at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007786
7787static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007788unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789{
7790 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7791 register const Py_UNICODE *e;
7792 int cased;
7793
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794 /* Shortcut for single character strings */
7795 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007798 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007799 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007801
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 e = p + PyUnicode_GET_SIZE(self);
7803 cased = 0;
7804 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007805 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007806
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7808 return PyBool_FromLong(0);
7809 else if (!cased && Py_UNICODE_ISUPPER(ch))
7810 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007812 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813}
7814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007815PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007818Return True if S is a titlecased string and there is at least one\n\
7819character in S, i.e. upper- and titlecase characters may only\n\
7820follow uncased characters and lowercase characters only cased ones.\n\
7821Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822
7823static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007824unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825{
7826 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7827 register const Py_UNICODE *e;
7828 int cased, previous_is_cased;
7829
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 /* Shortcut for single character strings */
7831 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7833 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007835 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007836 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007838
Guido van Rossumd57fd912000-03-10 22:53:23 +00007839 e = p + PyUnicode_GET_SIZE(self);
7840 cased = 0;
7841 previous_is_cased = 0;
7842 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007844
Benjamin Peterson29060642009-01-31 22:14:21 +00007845 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7846 if (previous_is_cased)
7847 return PyBool_FromLong(0);
7848 previous_is_cased = 1;
7849 cased = 1;
7850 }
7851 else if (Py_UNICODE_ISLOWER(ch)) {
7852 if (!previous_is_cased)
7853 return PyBool_FromLong(0);
7854 previous_is_cased = 1;
7855 cased = 1;
7856 }
7857 else
7858 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007860 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861}
7862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007863PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007865\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007866Return True if all characters in S are whitespace\n\
7867and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007868
7869static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007870unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007871{
7872 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7873 register const Py_UNICODE *e;
7874
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 /* Shortcut for single character strings */
7876 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 Py_UNICODE_ISSPACE(*p))
7878 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007880 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007881 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007883
Guido van Rossumd57fd912000-03-10 22:53:23 +00007884 e = p + PyUnicode_GET_SIZE(self);
7885 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 if (!Py_UNICODE_ISSPACE(*p))
7887 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007888 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007889 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890}
7891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007892PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007894\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007895Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007896and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007897
7898static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007899unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007900{
7901 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7902 register const Py_UNICODE *e;
7903
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007904 /* Shortcut for single character strings */
7905 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 Py_UNICODE_ISALPHA(*p))
7907 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007908
7909 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007910 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007912
7913 e = p + PyUnicode_GET_SIZE(self);
7914 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007915 if (!Py_UNICODE_ISALPHA(*p))
7916 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007917 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007918 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007919}
7920
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007921PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007923\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007924Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007925and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007926
7927static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007928unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007929{
7930 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7931 register const Py_UNICODE *e;
7932
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007933 /* Shortcut for single character strings */
7934 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 Py_UNICODE_ISALNUM(*p))
7936 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007937
7938 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007939 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007941
7942 e = p + PyUnicode_GET_SIZE(self);
7943 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 if (!Py_UNICODE_ISALNUM(*p))
7945 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007946 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007947 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007948}
7949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007950PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007953Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007954False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955
7956static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007957unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958{
7959 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7960 register const Py_UNICODE *e;
7961
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 /* Shortcut for single character strings */
7963 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 Py_UNICODE_ISDECIMAL(*p))
7965 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007967 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007968 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007970
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971 e = p + PyUnicode_GET_SIZE(self);
7972 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 if (!Py_UNICODE_ISDECIMAL(*p))
7974 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007976 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977}
7978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007979PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007982Return True if all characters in S are digits\n\
7983and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984
7985static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007986unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987{
7988 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7989 register const Py_UNICODE *e;
7990
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 /* Shortcut for single character strings */
7992 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 Py_UNICODE_ISDIGIT(*p))
7994 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007996 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007997 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007999
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 e = p + PyUnicode_GET_SIZE(self);
8001 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 if (!Py_UNICODE_ISDIGIT(*p))
8003 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008005 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006}
8007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008008PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008011Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008012False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013
8014static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008015unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016{
8017 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8018 register const Py_UNICODE *e;
8019
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 /* Shortcut for single character strings */
8021 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 Py_UNICODE_ISNUMERIC(*p))
8023 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008025 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008026 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008028
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029 e = p + PyUnicode_GET_SIZE(self);
8030 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 if (!Py_UNICODE_ISNUMERIC(*p))
8032 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008034 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035}
8036
Martin v. Löwis47383402007-08-15 07:32:56 +00008037int
8038PyUnicode_IsIdentifier(PyObject *self)
8039{
8040 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8041 register const Py_UNICODE *e;
8042
8043 /* Special case for empty strings */
8044 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008046
8047 /* PEP 3131 says that the first character must be in
8048 XID_Start and subsequent characters in XID_Continue,
8049 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008050 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008051 letters, digits, underscore). However, given the current
8052 definition of XID_Start and XID_Continue, it is sufficient
8053 to check just for these, except that _ must be allowed
8054 as starting an identifier. */
8055 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8056 return 0;
8057
8058 e = p + PyUnicode_GET_SIZE(self);
8059 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 if (!_PyUnicode_IsXidContinue(*p))
8061 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008062 }
8063 return 1;
8064}
8065
8066PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008068\n\
8069Return True if S is a valid identifier according\n\
8070to the language definition.");
8071
8072static PyObject*
8073unicode_isidentifier(PyObject *self)
8074{
8075 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8076}
8077
Georg Brandl559e5d72008-06-11 18:37:52 +00008078PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008080\n\
8081Return True if all characters in S are considered\n\
8082printable in repr() or S is empty, False otherwise.");
8083
8084static PyObject*
8085unicode_isprintable(PyObject *self)
8086{
8087 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8088 register const Py_UNICODE *e;
8089
8090 /* Shortcut for single character strings */
8091 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8092 Py_RETURN_TRUE;
8093 }
8094
8095 e = p + PyUnicode_GET_SIZE(self);
8096 for (; p < e; p++) {
8097 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8098 Py_RETURN_FALSE;
8099 }
8100 }
8101 Py_RETURN_TRUE;
8102}
8103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008104PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008105 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106\n\
8107Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008108iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109
8110static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008111unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008113 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114}
8115
Martin v. Löwis18e16552006-02-15 17:27:45 +00008116static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117unicode_length(PyUnicodeObject *self)
8118{
8119 return self->length;
8120}
8121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008122PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008124\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008125Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008126done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008127
8128static PyObject *
8129unicode_ljust(PyUnicodeObject *self, PyObject *args)
8130{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008131 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008132 Py_UNICODE fillchar = ' ';
8133
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008134 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135 return NULL;
8136
Tim Peters7a29bd52001-09-12 03:03:31 +00008137 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138 Py_INCREF(self);
8139 return (PyObject*) self;
8140 }
8141
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008142 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143}
8144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008145PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008148Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149
8150static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008151unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 return fixup(self, fixlower);
8154}
8155
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: skip the leading "|O:" (3 chars) of
   the corresponding format string. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8164
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008165/* externally visible for str.strip(unicode) */
8166PyObject *
8167_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8168{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8170 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8171 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8172 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8173 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008174
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008176
Benjamin Peterson14339b62009-01-31 16:36:08 +00008177 i = 0;
8178 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8180 i++;
8181 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008182 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008183
Benjamin Peterson14339b62009-01-31 16:36:08 +00008184 j = len;
8185 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 do {
8187 j--;
8188 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8189 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008190 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008191
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 Py_INCREF(self);
8194 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008195 }
8196 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008198}
8199
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200
8201static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008202do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008204 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8205 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008206
Benjamin Peterson14339b62009-01-31 16:36:08 +00008207 i = 0;
8208 if (striptype != RIGHTSTRIP) {
8209 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8210 i++;
8211 }
8212 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008213
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 j = len;
8215 if (striptype != LEFTSTRIP) {
8216 do {
8217 j--;
8218 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8219 j++;
8220 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008221
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8223 Py_INCREF(self);
8224 return (PyObject*)self;
8225 }
8226 else
8227 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228}
8229
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008230
8231static PyObject *
8232do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8233{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008234 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008235
Benjamin Peterson14339b62009-01-31 16:36:08 +00008236 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8237 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008238
Benjamin Peterson14339b62009-01-31 16:36:08 +00008239 if (sep != NULL && sep != Py_None) {
8240 if (PyUnicode_Check(sep))
8241 return _PyUnicode_XStrip(self, striptype, sep);
8242 else {
8243 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 "%s arg must be None or str",
8245 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008246 return NULL;
8247 }
8248 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008249
Benjamin Peterson14339b62009-01-31 16:36:08 +00008250 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008251}
8252
8253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008254PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008256\n\
8257Return a copy of the string S with leading and trailing\n\
8258whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008259If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008260
8261static PyObject *
8262unicode_strip(PyUnicodeObject *self, PyObject *args)
8263{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008264 if (PyTuple_GET_SIZE(args) == 0)
8265 return do_strip(self, BOTHSTRIP); /* Common case */
8266 else
8267 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008268}
8269
8270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008271PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008273\n\
8274Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008275If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008276
8277static PyObject *
8278unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8279{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008280 if (PyTuple_GET_SIZE(args) == 0)
8281 return do_strip(self, LEFTSTRIP); /* Common case */
8282 else
8283 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008284}
8285
8286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008287PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008288 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008289\n\
8290Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008291If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008292
8293static PyObject *
8294unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8295{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008296 if (PyTuple_GET_SIZE(args) == 0)
8297 return do_strip(self, RIGHTSTRIP); /* Common case */
8298 else
8299 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008300}
8301
8302
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008304unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305{
8306 PyUnicodeObject *u;
8307 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008308 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008309 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310
Georg Brandl222de0f2009-04-12 12:01:50 +00008311 if (len < 1) {
8312 Py_INCREF(unicode_empty);
8313 return (PyObject *)unicode_empty;
8314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315
Tim Peters7a29bd52001-09-12 03:03:31 +00008316 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 /* no repeat, return original string */
8318 Py_INCREF(str);
8319 return (PyObject*) str;
8320 }
Tim Peters8f422462000-09-09 06:13:41 +00008321
8322 /* ensure # of chars needed doesn't overflow int and # of bytes
8323 * needed doesn't overflow size_t
8324 */
8325 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008326 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008327 PyErr_SetString(PyExc_OverflowError,
8328 "repeated string is too long");
8329 return NULL;
8330 }
8331 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8332 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8333 PyErr_SetString(PyExc_OverflowError,
8334 "repeated string is too long");
8335 return NULL;
8336 }
8337 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 if (!u)
8339 return NULL;
8340
8341 p = u->str;
8342
Georg Brandl222de0f2009-04-12 12:01:50 +00008343 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008344 Py_UNICODE_FILL(p, str->str[0], len);
8345 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008346 Py_ssize_t done = str->length; /* number of characters copied this far */
8347 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008349 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008350 Py_UNICODE_COPY(p+done, p, n);
8351 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 }
8354
8355 return (PyObject*) u;
8356}
8357
Alexander Belopolsky40018472011-02-26 01:02:56 +00008358PyObject *
8359PyUnicode_Replace(PyObject *obj,
8360 PyObject *subobj,
8361 PyObject *replobj,
8362 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363{
8364 PyObject *self;
8365 PyObject *str1;
8366 PyObject *str2;
8367 PyObject *result;
8368
8369 self = PyUnicode_FromObject(obj);
8370 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008372 str1 = PyUnicode_FromObject(subobj);
8373 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 Py_DECREF(self);
8375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376 }
8377 str2 = PyUnicode_FromObject(replobj);
8378 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 Py_DECREF(self);
8380 Py_DECREF(str1);
8381 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 }
Tim Petersced69f82003-09-16 20:30:58 +00008383 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 (PyUnicodeObject *)str1,
8385 (PyUnicodeObject *)str2,
8386 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 Py_DECREF(self);
8388 Py_DECREF(str1);
8389 Py_DECREF(str2);
8390 return result;
8391}
8392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008393PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008394 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395\n\
8396Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008397old replaced by new. If the optional argument count is\n\
8398given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399
8400static PyObject*
8401unicode_replace(PyUnicodeObject *self, PyObject *args)
8402{
8403 PyUnicodeObject *str1;
8404 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008405 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406 PyObject *result;
8407
Martin v. Löwis18e16552006-02-15 17:27:45 +00008408 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409 return NULL;
8410 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8411 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008414 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 Py_DECREF(str1);
8416 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008418
8419 result = replace(self, str1, str2, maxcount);
8420
8421 Py_DECREF(str1);
8422 Py_DECREF(str2);
8423 return result;
8424}
8425
Alexander Belopolsky40018472011-02-26 01:02:56 +00008426static PyObject *
8427unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008429 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008430 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008431 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8432 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8433
8434 /* XXX(nnorwitz): rather than over-allocating, it would be
8435 better to choose a different scheme. Perhaps scan the
8436 first N-chars of the string and allocate based on that size.
8437 */
8438 /* Initial allocation is based on the longest-possible unichr
8439 escape.
8440
8441 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8442 unichr, so in this case it's the longest unichr escape. In
8443 narrow (UTF-16) builds this is five chars per source unichr
8444 since there are two unichrs in the surrogate pair, so in narrow
8445 (UTF-16) builds it's not the longest unichr escape.
8446
8447 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8448 so in the narrow (UTF-16) build case it's the longest unichr
8449 escape.
8450 */
8451
Walter Dörwald1ab83302007-05-18 17:15:44 +00008452 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008454#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008456#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008458#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008460 if (repr == NULL)
8461 return NULL;
8462
Walter Dörwald1ab83302007-05-18 17:15:44 +00008463 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008464
8465 /* Add quote */
8466 *p++ = (findchar(s, size, '\'') &&
8467 !findchar(s, size, '"')) ? '"' : '\'';
8468 while (size-- > 0) {
8469 Py_UNICODE ch = *s++;
8470
8471 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008472 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008473 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008474 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008475 continue;
8476 }
8477
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008479 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008480 *p++ = '\\';
8481 *p++ = 't';
8482 }
8483 else if (ch == '\n') {
8484 *p++ = '\\';
8485 *p++ = 'n';
8486 }
8487 else if (ch == '\r') {
8488 *p++ = '\\';
8489 *p++ = 'r';
8490 }
8491
8492 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008493 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008494 *p++ = '\\';
8495 *p++ = 'x';
8496 *p++ = hexdigits[(ch >> 4) & 0x000F];
8497 *p++ = hexdigits[ch & 0x000F];
8498 }
8499
Georg Brandl559e5d72008-06-11 18:37:52 +00008500 /* Copy ASCII characters as-is */
8501 else if (ch < 0x7F) {
8502 *p++ = ch;
8503 }
8504
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008506 else {
8507 Py_UCS4 ucs = ch;
8508
8509#ifndef Py_UNICODE_WIDE
8510 Py_UNICODE ch2 = 0;
8511 /* Get code point from surrogate pair */
8512 if (size > 0) {
8513 ch2 = *s;
8514 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008516 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008518 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008519 size--;
8520 }
8521 }
8522#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008523 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008524 (categories Z* and C* except ASCII space)
8525 */
8526 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8527 /* Map 8-bit characters to '\xhh' */
8528 if (ucs <= 0xff) {
8529 *p++ = '\\';
8530 *p++ = 'x';
8531 *p++ = hexdigits[(ch >> 4) & 0x000F];
8532 *p++ = hexdigits[ch & 0x000F];
8533 }
8534 /* Map 21-bit characters to '\U00xxxxxx' */
8535 else if (ucs >= 0x10000) {
8536 *p++ = '\\';
8537 *p++ = 'U';
8538 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8539 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8540 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8541 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8542 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8543 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8544 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8545 *p++ = hexdigits[ucs & 0x0000000F];
8546 }
8547 /* Map 16-bit characters to '\uxxxx' */
8548 else {
8549 *p++ = '\\';
8550 *p++ = 'u';
8551 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8552 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8553 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8554 *p++ = hexdigits[ucs & 0x000F];
8555 }
8556 }
8557 /* Copy characters as-is */
8558 else {
8559 *p++ = ch;
8560#ifndef Py_UNICODE_WIDE
8561 if (ucs >= 0x10000)
8562 *p++ = ch2;
8563#endif
8564 }
8565 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008566 }
8567 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008568 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008569
8570 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008571 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008572 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573}
8574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008575PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008577\n\
8578Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008579such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580arguments start and end are interpreted as in slice notation.\n\
8581\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008582Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583
8584static PyObject *
8585unicode_rfind(PyUnicodeObject *self, PyObject *args)
8586{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008587 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008588 Py_ssize_t start;
8589 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008590 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
Christian Heimes9cd17752007-11-18 19:35:23 +00008592 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594
Thomas Wouters477c8d52006-05-27 19:21:47 +00008595 result = stringlib_rfind_slice(
8596 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8597 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8598 start, end
8599 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600
8601 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008602
Christian Heimes217cfd12007-12-02 14:31:20 +00008603 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008604}
8605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008606PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008609Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610
8611static PyObject *
8612unicode_rindex(PyUnicodeObject *self, PyObject *args)
8613{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008614 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008615 Py_ssize_t start;
8616 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008617 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618
Christian Heimes9cd17752007-11-18 19:35:23 +00008619 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621
Thomas Wouters477c8d52006-05-27 19:21:47 +00008622 result = stringlib_rfind_slice(
8623 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8624 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8625 start, end
8626 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627
8628 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008629
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630 if (result < 0) {
8631 PyErr_SetString(PyExc_ValueError, "substring not found");
8632 return NULL;
8633 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008634 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635}
8636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008637PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008640Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008641done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642
8643static PyObject *
8644unicode_rjust(PyUnicodeObject *self, PyObject *args)
8645{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008646 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008647 Py_UNICODE fillchar = ' ';
8648
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008649 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 return NULL;
8651
Tim Peters7a29bd52001-09-12 03:03:31 +00008652 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653 Py_INCREF(self);
8654 return (PyObject*) self;
8655 }
8656
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008657 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658}
8659
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660PyObject *
8661PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662{
8663 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008664
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 s = PyUnicode_FromObject(s);
8666 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008667 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008668 if (sep != NULL) {
8669 sep = PyUnicode_FromObject(sep);
8670 if (sep == NULL) {
8671 Py_DECREF(s);
8672 return NULL;
8673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 }
8675
8676 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8677
8678 Py_DECREF(s);
8679 Py_XDECREF(sep);
8680 return result;
8681}
8682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008683PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685\n\
8686Return a list of the words in S, using sep as the\n\
8687delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008688splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008689whitespace string is a separator and empty strings are\n\
8690removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691
8692static PyObject*
8693unicode_split(PyUnicodeObject *self, PyObject *args)
8694{
8695 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008696 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697
Martin v. Löwis18e16552006-02-15 17:27:45 +00008698 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008699 return NULL;
8700
8701 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707}
8708
Thomas Wouters477c8d52006-05-27 19:21:47 +00008709PyObject *
8710PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8711{
8712 PyObject* str_obj;
8713 PyObject* sep_obj;
8714 PyObject* out;
8715
8716 str_obj = PyUnicode_FromObject(str_in);
8717 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008719 sep_obj = PyUnicode_FromObject(sep_in);
8720 if (!sep_obj) {
8721 Py_DECREF(str_obj);
8722 return NULL;
8723 }
8724
8725 out = stringlib_partition(
8726 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8727 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8728 );
8729
8730 Py_DECREF(sep_obj);
8731 Py_DECREF(str_obj);
8732
8733 return out;
8734}
8735
8736
8737PyObject *
8738PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8739{
8740 PyObject* str_obj;
8741 PyObject* sep_obj;
8742 PyObject* out;
8743
8744 str_obj = PyUnicode_FromObject(str_in);
8745 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008747 sep_obj = PyUnicode_FromObject(sep_in);
8748 if (!sep_obj) {
8749 Py_DECREF(str_obj);
8750 return NULL;
8751 }
8752
8753 out = stringlib_rpartition(
8754 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8755 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8756 );
8757
8758 Py_DECREF(sep_obj);
8759 Py_DECREF(str_obj);
8760
8761 return out;
8762}
8763
8764PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008766\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008767Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008768the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008769found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008770
8771static PyObject*
8772unicode_partition(PyUnicodeObject *self, PyObject *separator)
8773{
8774 return PyUnicode_Partition((PyObject *)self, separator);
8775}
8776
8777PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008778 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008779\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008780Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008781the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008782separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008783
8784static PyObject*
8785unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8786{
8787 return PyUnicode_RPartition((PyObject *)self, separator);
8788}
8789
Alexander Belopolsky40018472011-02-26 01:02:56 +00008790PyObject *
8791PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008792{
8793 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008794
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008795 s = PyUnicode_FromObject(s);
8796 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008797 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 if (sep != NULL) {
8799 sep = PyUnicode_FromObject(sep);
8800 if (sep == NULL) {
8801 Py_DECREF(s);
8802 return NULL;
8803 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008804 }
8805
8806 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8807
8808 Py_DECREF(s);
8809 Py_XDECREF(sep);
8810 return result;
8811}
8812
8813PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008815\n\
8816Return a list of the words in S, using sep as the\n\
8817delimiter string, starting at the end of the string and\n\
8818working to the front. If maxsplit is given, at most maxsplit\n\
8819splits are done. If sep is not specified, any whitespace string\n\
8820is a separator.");
8821
8822static PyObject*
8823unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8824{
8825 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008826 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008827
Martin v. Löwis18e16552006-02-15 17:27:45 +00008828 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008829 return NULL;
8830
8831 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008833 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008835 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008837}
8838
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008839PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841\n\
8842Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008843Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008844is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845
8846static PyObject*
8847unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8848{
Guido van Rossum86662912000-04-11 15:38:46 +00008849 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850
Guido van Rossum86662912000-04-11 15:38:46 +00008851 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008852 return NULL;
8853
Guido van Rossum86662912000-04-11 15:38:46 +00008854 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855}
8856
8857static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008858PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859{
Walter Dörwald346737f2007-05-31 10:44:43 +00008860 if (PyUnicode_CheckExact(self)) {
8861 Py_INCREF(self);
8862 return self;
8863 } else
8864 /* Subtype -- return genuine unicode string with the same value. */
8865 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8866 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867}
8868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008869PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008870 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871\n\
8872Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008873and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874
8875static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008876unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878 return fixup(self, fixswapcase);
8879}
8880
Georg Brandlceee0772007-11-27 23:48:05 +00008881PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008883\n\
8884Return a translation table usable for str.translate().\n\
8885If there is only one argument, it must be a dictionary mapping Unicode\n\
8886ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008887Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008888If there are two arguments, they must be strings of equal length, and\n\
8889in the resulting dictionary, each character in x will be mapped to the\n\
8890character at the same position in y. If there is a third argument, it\n\
8891must be a string, whose characters will be mapped to None in the result.");
8892
8893static PyObject*
8894unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8895{
8896 PyObject *x, *y = NULL, *z = NULL;
8897 PyObject *new = NULL, *key, *value;
8898 Py_ssize_t i = 0;
8899 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008900
Georg Brandlceee0772007-11-27 23:48:05 +00008901 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8902 return NULL;
8903 new = PyDict_New();
8904 if (!new)
8905 return NULL;
8906 if (y != NULL) {
8907 /* x must be a string too, of equal length */
8908 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8909 if (!PyUnicode_Check(x)) {
8910 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8911 "be a string if there is a second argument");
8912 goto err;
8913 }
8914 if (PyUnicode_GET_SIZE(x) != ylen) {
8915 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8916 "arguments must have equal length");
8917 goto err;
8918 }
8919 /* create entries for translating chars in x to those in y */
8920 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008921 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8922 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008923 if (!key || !value)
8924 goto err;
8925 res = PyDict_SetItem(new, key, value);
8926 Py_DECREF(key);
8927 Py_DECREF(value);
8928 if (res < 0)
8929 goto err;
8930 }
8931 /* create entries for deleting chars in z */
8932 if (z != NULL) {
8933 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008934 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008935 if (!key)
8936 goto err;
8937 res = PyDict_SetItem(new, key, Py_None);
8938 Py_DECREF(key);
8939 if (res < 0)
8940 goto err;
8941 }
8942 }
8943 } else {
8944 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008945 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008946 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8947 "to maketrans it must be a dict");
8948 goto err;
8949 }
8950 /* copy entries into the new dict, converting string keys to int keys */
8951 while (PyDict_Next(x, &i, &key, &value)) {
8952 if (PyUnicode_Check(key)) {
8953 /* convert string keys to integer keys */
8954 PyObject *newkey;
8955 if (PyUnicode_GET_SIZE(key) != 1) {
8956 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8957 "table must be of length 1");
8958 goto err;
8959 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008960 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008961 if (!newkey)
8962 goto err;
8963 res = PyDict_SetItem(new, newkey, value);
8964 Py_DECREF(newkey);
8965 if (res < 0)
8966 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008967 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008968 /* just keep integer keys */
8969 if (PyDict_SetItem(new, key, value) < 0)
8970 goto err;
8971 } else {
8972 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8973 "be strings or integers");
8974 goto err;
8975 }
8976 }
8977 }
8978 return new;
8979 err:
8980 Py_DECREF(new);
8981 return NULL;
8982}
8983
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008984PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986\n\
8987Return a copy of the string S, where all characters have been mapped\n\
8988through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008989Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008990Unmapped characters are left untouched. Characters mapped to None\n\
8991are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992
8993static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008994unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995{
Georg Brandlceee0772007-11-27 23:48:05 +00008996 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997}
8998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008999PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009002Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003
9004static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009005unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007 return fixup(self, fixupper);
9008}
9009
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009010PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009011 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009013Pad a numeric string S with zeros on the left, to fill a field\n\
9014of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015
9016static PyObject *
9017unicode_zfill(PyUnicodeObject *self, PyObject *args)
9018{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009019 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 PyUnicodeObject *u;
9021
Martin v. Löwis18e16552006-02-15 17:27:45 +00009022 Py_ssize_t width;
9023 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024 return NULL;
9025
9026 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009027 if (PyUnicode_CheckExact(self)) {
9028 Py_INCREF(self);
9029 return (PyObject*) self;
9030 }
9031 else
9032 return PyUnicode_FromUnicode(
9033 PyUnicode_AS_UNICODE(self),
9034 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 }
9037
9038 fill = width - self->length;
9039
9040 u = pad(self, fill, 0, '0');
9041
Walter Dörwald068325e2002-04-15 13:36:47 +00009042 if (u == NULL)
9043 return NULL;
9044
Guido van Rossumd57fd912000-03-10 22:53:23 +00009045 if (u->str[fill] == '+' || u->str[fill] == '-') {
9046 /* move sign to beginning of string */
9047 u->str[0] = u->str[fill];
9048 u->str[fill] = '0';
9049 }
9050
9051 return (PyObject*) u;
9052}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053
#if 0
/* Debugging helpers, currently compiled out. */

/* Return the value of numfree (defined outside this block) as an int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}

/* Feed self's buffer and size to PyUnicode_TransformDecimalToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
9068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009069PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009070 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009072Return True if S starts with the specified prefix, False otherwise.\n\
9073With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009074With optional end, stop comparing S at that position.\n\
9075prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076
9077static PyObject *
9078unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009079 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009081 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009082 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009083 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009084 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009085 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009087 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9089 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009090 if (PyTuple_Check(subobj)) {
9091 Py_ssize_t i;
9092 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9093 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009095 if (substring == NULL)
9096 return NULL;
9097 result = tailmatch(self, substring, start, end, -1);
9098 Py_DECREF(substring);
9099 if (result) {
9100 Py_RETURN_TRUE;
9101 }
9102 }
9103 /* nothing matched */
9104 Py_RETURN_FALSE;
9105 }
9106 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009108 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009109 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009111 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112}
9113
9114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009115PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009116 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009118Return True if S ends with the specified suffix, False otherwise.\n\
9119With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009120With optional end, stop comparing S at that position.\n\
9121suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122
9123static PyObject *
9124unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009127 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009129 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009130 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009131 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009133 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9135 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009136 if (PyTuple_Check(subobj)) {
9137 Py_ssize_t i;
9138 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9139 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009141 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009143 result = tailmatch(self, substring, start, end, +1);
9144 Py_DECREF(substring);
9145 if (result) {
9146 Py_RETURN_TRUE;
9147 }
9148 }
9149 Py_RETURN_FALSE;
9150 }
9151 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009155 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009157 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158}
9159
Eric Smith8c663262007-08-25 02:26:07 +00009160#include "stringlib/string_format.h"
9161
9162PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009164\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009165Return a formatted version of S, using substitutions from args and kwargs.\n\
9166The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009167
Eric Smith27bbca62010-11-04 17:06:58 +00009168PyDoc_STRVAR(format_map__doc__,
9169 "S.format_map(mapping) -> str\n\
9170\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009171Return a formatted version of S, using substitutions from mapping.\n\
9172The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009173
Eric Smith4a7d76d2008-05-30 18:10:19 +00009174static PyObject *
9175unicode__format__(PyObject* self, PyObject* args)
9176{
9177 PyObject *format_spec;
9178
9179 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9180 return NULL;
9181
9182 return _PyUnicode_FormatAdvanced(self,
9183 PyUnicode_AS_UNICODE(format_spec),
9184 PyUnicode_GET_SIZE(format_spec));
9185}
9186
Eric Smith8c663262007-08-25 02:26:07 +00009187PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009188 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009189\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009190Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009191
9192static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009193unicode__sizeof__(PyUnicodeObject *v)
9194{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009195 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9196 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009197}
9198
9199PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009201
9202static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009203unicode_getnewargs(PyUnicodeObject *v)
9204{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009205 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009206}
9207
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208static PyMethodDef unicode_methods[] = {
9209
9210 /* Order is according to common usage: often used methods should
9211 appear first, since lookup is done sequentially. */
9212
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009213 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009214 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9215 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009216 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009217 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9218 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9219 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9220 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9221 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9222 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9223 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009224 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009225 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9226 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9227 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009228 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009229 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9230 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9231 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009232 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009233 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009234 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009235 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009236 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9237 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9238 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9239 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9240 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9241 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9242 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9243 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9244 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9245 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9246 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9247 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9248 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9249 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009250 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009251 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009252 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009253 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009254 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009255 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009256 {"maketrans", (PyCFunction) unicode_maketrans,
9257 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009258 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009259#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009260 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261#endif
9262
9263#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009264 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009265 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009266 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009267#endif
9268
Benjamin Peterson14339b62009-01-31 16:36:08 +00009269 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270 {NULL, NULL}
9271};
9272
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009273static PyObject *
9274unicode_mod(PyObject *v, PyObject *w)
9275{
Benjamin Peterson29060642009-01-31 22:14:21 +00009276 if (!PyUnicode_Check(v)) {
9277 Py_INCREF(Py_NotImplemented);
9278 return Py_NotImplemented;
9279 }
9280 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009281}
9282
9283static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009284 0, /*nb_add*/
9285 0, /*nb_subtract*/
9286 0, /*nb_multiply*/
9287 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009288};
9289
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009291 (lenfunc) unicode_length, /* sq_length */
9292 PyUnicode_Concat, /* sq_concat */
9293 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9294 (ssizeargfunc) unicode_getitem, /* sq_item */
9295 0, /* sq_slice */
9296 0, /* sq_ass_item */
9297 0, /* sq_ass_slice */
9298 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299};
9300
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009301static PyObject*
9302unicode_subscript(PyUnicodeObject* self, PyObject* item)
9303{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009304 if (PyIndex_Check(item)) {
9305 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009306 if (i == -1 && PyErr_Occurred())
9307 return NULL;
9308 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009309 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009310 return unicode_getitem(self, i);
9311 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009312 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009313 Py_UNICODE* source_buf;
9314 Py_UNICODE* result_buf;
9315 PyObject* result;
9316
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009317 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009318 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009319 return NULL;
9320 }
9321
9322 if (slicelength <= 0) {
9323 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009324 } else if (start == 0 && step == 1 && slicelength == self->length &&
9325 PyUnicode_CheckExact(self)) {
9326 Py_INCREF(self);
9327 return (PyObject *)self;
9328 } else if (step == 1) {
9329 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009330 } else {
9331 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009332 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9333 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009334
Benjamin Peterson29060642009-01-31 22:14:21 +00009335 if (result_buf == NULL)
9336 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009337
9338 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9339 result_buf[i] = source_buf[cur];
9340 }
Tim Petersced69f82003-09-16 20:30:58 +00009341
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009342 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009343 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009344 return result;
9345 }
9346 } else {
9347 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9348 return NULL;
9349 }
9350}
9351
9352static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009353 (lenfunc)unicode_length, /* mp_length */
9354 (binaryfunc)unicode_subscript, /* mp_subscript */
9355 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009356};
9357
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359/* Helpers for PyUnicode_Format() */
9360
9361static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009362getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009364 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009366 (*p_argidx)++;
9367 if (arglen < 0)
9368 return args;
9369 else
9370 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371 }
9372 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374 return NULL;
9375}
9376
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009377/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009379static PyObject *
9380formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009382 char *p;
9383 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009385
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 x = PyFloat_AsDouble(v);
9387 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009388 return NULL;
9389
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009392
Eric Smith0923d1d2009-04-16 20:16:10 +00009393 p = PyOS_double_to_string(x, type, prec,
9394 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009395 if (p == NULL)
9396 return NULL;
9397 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009398 PyMem_Free(p);
9399 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400}
9401
Tim Peters38fd5b62000-09-21 05:43:11 +00009402static PyObject*
9403formatlong(PyObject *val, int flags, int prec, int type)
9404{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009405 char *buf;
9406 int len;
9407 PyObject *str; /* temporary string object. */
9408 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009409
Benjamin Peterson14339b62009-01-31 16:36:08 +00009410 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9411 if (!str)
9412 return NULL;
9413 result = PyUnicode_FromStringAndSize(buf, len);
9414 Py_DECREF(str);
9415 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009416}
9417
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418static int
9419formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009420 size_t buflen,
9421 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009423 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009424 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009425 if (PyUnicode_GET_SIZE(v) == 1) {
9426 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9427 buf[1] = '\0';
9428 return 1;
9429 }
9430#ifndef Py_UNICODE_WIDE
9431 if (PyUnicode_GET_SIZE(v) == 2) {
9432 /* Decode a valid surrogate pair */
9433 int c0 = PyUnicode_AS_UNICODE(v)[0];
9434 int c1 = PyUnicode_AS_UNICODE(v)[1];
9435 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9436 0xDC00 <= c1 && c1 <= 0xDFFF) {
9437 buf[0] = c0;
9438 buf[1] = c1;
9439 buf[2] = '\0';
9440 return 2;
9441 }
9442 }
9443#endif
9444 goto onError;
9445 }
9446 else {
9447 /* Integer input truncated to a character */
9448 long x;
9449 x = PyLong_AsLong(v);
9450 if (x == -1 && PyErr_Occurred())
9451 goto onError;
9452
9453 if (x < 0 || x > 0x10ffff) {
9454 PyErr_SetString(PyExc_OverflowError,
9455 "%c arg not in range(0x110000)");
9456 return -1;
9457 }
9458
9459#ifndef Py_UNICODE_WIDE
9460 if (x > 0xffff) {
9461 x -= 0x10000;
9462 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9463 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9464 return 2;
9465 }
9466#endif
9467 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009468 buf[1] = '\0';
9469 return 1;
9470 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009471
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009473 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009474 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009475 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476}
9477
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009478/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009479 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009480*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009481#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009482
Alexander Belopolsky40018472011-02-26 01:02:56 +00009483PyObject *
9484PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485{
9486 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009487 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 int args_owned = 0;
9489 PyUnicodeObject *result = NULL;
9490 PyObject *dict = NULL;
9491 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009492
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009494 PyErr_BadInternalCall();
9495 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 }
9497 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009498 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009500 fmt = PyUnicode_AS_UNICODE(uformat);
9501 fmtcnt = PyUnicode_GET_SIZE(uformat);
9502
9503 reslen = rescnt = fmtcnt + 100;
9504 result = _PyUnicode_New(reslen);
9505 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009506 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 res = PyUnicode_AS_UNICODE(result);
9508
9509 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 arglen = PyTuple_Size(args);
9511 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512 }
9513 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009514 arglen = -1;
9515 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009517 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009518 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009519 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520
9521 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 if (*fmt != '%') {
9523 if (--rescnt < 0) {
9524 rescnt = fmtcnt + 100;
9525 reslen += rescnt;
9526 if (_PyUnicode_Resize(&result, reslen) < 0)
9527 goto onError;
9528 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9529 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009530 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009531 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009532 }
9533 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 /* Got a format specifier */
9535 int flags = 0;
9536 Py_ssize_t width = -1;
9537 int prec = -1;
9538 Py_UNICODE c = '\0';
9539 Py_UNICODE fill;
9540 int isnumok;
9541 PyObject *v = NULL;
9542 PyObject *temp = NULL;
9543 Py_UNICODE *pbuf;
9544 Py_UNICODE sign;
9545 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009546 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547
Benjamin Peterson29060642009-01-31 22:14:21 +00009548 fmt++;
9549 if (*fmt == '(') {
9550 Py_UNICODE *keystart;
9551 Py_ssize_t keylen;
9552 PyObject *key;
9553 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009554
Benjamin Peterson29060642009-01-31 22:14:21 +00009555 if (dict == NULL) {
9556 PyErr_SetString(PyExc_TypeError,
9557 "format requires a mapping");
9558 goto onError;
9559 }
9560 ++fmt;
9561 --fmtcnt;
9562 keystart = fmt;
9563 /* Skip over balanced parentheses */
9564 while (pcount > 0 && --fmtcnt >= 0) {
9565 if (*fmt == ')')
9566 --pcount;
9567 else if (*fmt == '(')
9568 ++pcount;
9569 fmt++;
9570 }
9571 keylen = fmt - keystart - 1;
9572 if (fmtcnt < 0 || pcount > 0) {
9573 PyErr_SetString(PyExc_ValueError,
9574 "incomplete format key");
9575 goto onError;
9576 }
9577#if 0
9578 /* keys are converted to strings using UTF-8 and
9579 then looked up since Python uses strings to hold
9580 variables names etc. in its namespaces and we
9581 wouldn't want to break common idioms. */
9582 key = PyUnicode_EncodeUTF8(keystart,
9583 keylen,
9584 NULL);
9585#else
9586 key = PyUnicode_FromUnicode(keystart, keylen);
9587#endif
9588 if (key == NULL)
9589 goto onError;
9590 if (args_owned) {
9591 Py_DECREF(args);
9592 args_owned = 0;
9593 }
9594 args = PyObject_GetItem(dict, key);
9595 Py_DECREF(key);
9596 if (args == NULL) {
9597 goto onError;
9598 }
9599 args_owned = 1;
9600 arglen = -1;
9601 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009602 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009603 while (--fmtcnt >= 0) {
9604 switch (c = *fmt++) {
9605 case '-': flags |= F_LJUST; continue;
9606 case '+': flags |= F_SIGN; continue;
9607 case ' ': flags |= F_BLANK; continue;
9608 case '#': flags |= F_ALT; continue;
9609 case '0': flags |= F_ZERO; continue;
9610 }
9611 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009612 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009613 if (c == '*') {
9614 v = getnextarg(args, arglen, &argidx);
9615 if (v == NULL)
9616 goto onError;
9617 if (!PyLong_Check(v)) {
9618 PyErr_SetString(PyExc_TypeError,
9619 "* wants int");
9620 goto onError;
9621 }
9622 width = PyLong_AsLong(v);
9623 if (width == -1 && PyErr_Occurred())
9624 goto onError;
9625 if (width < 0) {
9626 flags |= F_LJUST;
9627 width = -width;
9628 }
9629 if (--fmtcnt >= 0)
9630 c = *fmt++;
9631 }
9632 else if (c >= '0' && c <= '9') {
9633 width = c - '0';
9634 while (--fmtcnt >= 0) {
9635 c = *fmt++;
9636 if (c < '0' || c > '9')
9637 break;
9638 if ((width*10) / 10 != width) {
9639 PyErr_SetString(PyExc_ValueError,
9640 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009641 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009642 }
9643 width = width*10 + (c - '0');
9644 }
9645 }
9646 if (c == '.') {
9647 prec = 0;
9648 if (--fmtcnt >= 0)
9649 c = *fmt++;
9650 if (c == '*') {
9651 v = getnextarg(args, arglen, &argidx);
9652 if (v == NULL)
9653 goto onError;
9654 if (!PyLong_Check(v)) {
9655 PyErr_SetString(PyExc_TypeError,
9656 "* wants int");
9657 goto onError;
9658 }
9659 prec = PyLong_AsLong(v);
9660 if (prec == -1 && PyErr_Occurred())
9661 goto onError;
9662 if (prec < 0)
9663 prec = 0;
9664 if (--fmtcnt >= 0)
9665 c = *fmt++;
9666 }
9667 else if (c >= '0' && c <= '9') {
9668 prec = c - '0';
9669 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009670 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009671 if (c < '0' || c > '9')
9672 break;
9673 if ((prec*10) / 10 != prec) {
9674 PyErr_SetString(PyExc_ValueError,
9675 "prec too big");
9676 goto onError;
9677 }
9678 prec = prec*10 + (c - '0');
9679 }
9680 }
9681 } /* prec */
9682 if (fmtcnt >= 0) {
9683 if (c == 'h' || c == 'l' || c == 'L') {
9684 if (--fmtcnt >= 0)
9685 c = *fmt++;
9686 }
9687 }
9688 if (fmtcnt < 0) {
9689 PyErr_SetString(PyExc_ValueError,
9690 "incomplete format");
9691 goto onError;
9692 }
9693 if (c != '%') {
9694 v = getnextarg(args, arglen, &argidx);
9695 if (v == NULL)
9696 goto onError;
9697 }
9698 sign = 0;
9699 fill = ' ';
9700 switch (c) {
9701
9702 case '%':
9703 pbuf = formatbuf;
9704 /* presume that buffer length is at least 1 */
9705 pbuf[0] = '%';
9706 len = 1;
9707 break;
9708
9709 case 's':
9710 case 'r':
9711 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009712 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009713 temp = v;
9714 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009715 }
9716 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009717 if (c == 's')
9718 temp = PyObject_Str(v);
9719 else if (c == 'r')
9720 temp = PyObject_Repr(v);
9721 else
9722 temp = PyObject_ASCII(v);
9723 if (temp == NULL)
9724 goto onError;
9725 if (PyUnicode_Check(temp))
9726 /* nothing to do */;
9727 else {
9728 Py_DECREF(temp);
9729 PyErr_SetString(PyExc_TypeError,
9730 "%s argument has non-string str()");
9731 goto onError;
9732 }
9733 }
9734 pbuf = PyUnicode_AS_UNICODE(temp);
9735 len = PyUnicode_GET_SIZE(temp);
9736 if (prec >= 0 && len > prec)
9737 len = prec;
9738 break;
9739
9740 case 'i':
9741 case 'd':
9742 case 'u':
9743 case 'o':
9744 case 'x':
9745 case 'X':
9746 if (c == 'i')
9747 c = 'd';
9748 isnumok = 0;
9749 if (PyNumber_Check(v)) {
9750 PyObject *iobj=NULL;
9751
9752 if (PyLong_Check(v)) {
9753 iobj = v;
9754 Py_INCREF(iobj);
9755 }
9756 else {
9757 iobj = PyNumber_Long(v);
9758 }
9759 if (iobj!=NULL) {
9760 if (PyLong_Check(iobj)) {
9761 isnumok = 1;
9762 temp = formatlong(iobj, flags, prec, c);
9763 Py_DECREF(iobj);
9764 if (!temp)
9765 goto onError;
9766 pbuf = PyUnicode_AS_UNICODE(temp);
9767 len = PyUnicode_GET_SIZE(temp);
9768 sign = 1;
9769 }
9770 else {
9771 Py_DECREF(iobj);
9772 }
9773 }
9774 }
9775 if (!isnumok) {
9776 PyErr_Format(PyExc_TypeError,
9777 "%%%c format: a number is required, "
9778 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9779 goto onError;
9780 }
9781 if (flags & F_ZERO)
9782 fill = '0';
9783 break;
9784
9785 case 'e':
9786 case 'E':
9787 case 'f':
9788 case 'F':
9789 case 'g':
9790 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009791 temp = formatfloat(v, flags, prec, c);
9792 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009793 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009794 pbuf = PyUnicode_AS_UNICODE(temp);
9795 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009796 sign = 1;
9797 if (flags & F_ZERO)
9798 fill = '0';
9799 break;
9800
9801 case 'c':
9802 pbuf = formatbuf;
9803 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9804 if (len < 0)
9805 goto onError;
9806 break;
9807
9808 default:
9809 PyErr_Format(PyExc_ValueError,
9810 "unsupported format character '%c' (0x%x) "
9811 "at index %zd",
9812 (31<=c && c<=126) ? (char)c : '?',
9813 (int)c,
9814 (Py_ssize_t)(fmt - 1 -
9815 PyUnicode_AS_UNICODE(uformat)));
9816 goto onError;
9817 }
9818 if (sign) {
9819 if (*pbuf == '-' || *pbuf == '+') {
9820 sign = *pbuf++;
9821 len--;
9822 }
9823 else if (flags & F_SIGN)
9824 sign = '+';
9825 else if (flags & F_BLANK)
9826 sign = ' ';
9827 else
9828 sign = 0;
9829 }
9830 if (width < len)
9831 width = len;
9832 if (rescnt - (sign != 0) < width) {
9833 reslen -= rescnt;
9834 rescnt = width + fmtcnt + 100;
9835 reslen += rescnt;
9836 if (reslen < 0) {
9837 Py_XDECREF(temp);
9838 PyErr_NoMemory();
9839 goto onError;
9840 }
9841 if (_PyUnicode_Resize(&result, reslen) < 0) {
9842 Py_XDECREF(temp);
9843 goto onError;
9844 }
9845 res = PyUnicode_AS_UNICODE(result)
9846 + reslen - rescnt;
9847 }
9848 if (sign) {
9849 if (fill != ' ')
9850 *res++ = sign;
9851 rescnt--;
9852 if (width > len)
9853 width--;
9854 }
9855 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9856 assert(pbuf[0] == '0');
9857 assert(pbuf[1] == c);
9858 if (fill != ' ') {
9859 *res++ = *pbuf++;
9860 *res++ = *pbuf++;
9861 }
9862 rescnt -= 2;
9863 width -= 2;
9864 if (width < 0)
9865 width = 0;
9866 len -= 2;
9867 }
9868 if (width > len && !(flags & F_LJUST)) {
9869 do {
9870 --rescnt;
9871 *res++ = fill;
9872 } while (--width > len);
9873 }
9874 if (fill == ' ') {
9875 if (sign)
9876 *res++ = sign;
9877 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9878 assert(pbuf[0] == '0');
9879 assert(pbuf[1] == c);
9880 *res++ = *pbuf++;
9881 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009882 }
9883 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009884 Py_UNICODE_COPY(res, pbuf, len);
9885 res += len;
9886 rescnt -= len;
9887 while (--width >= len) {
9888 --rescnt;
9889 *res++ = ' ';
9890 }
9891 if (dict && (argidx < arglen) && c != '%') {
9892 PyErr_SetString(PyExc_TypeError,
9893 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009894 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009895 goto onError;
9896 }
9897 Py_XDECREF(temp);
9898 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899 } /* until end */
9900 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009901 PyErr_SetString(PyExc_TypeError,
9902 "not all arguments converted during string formatting");
9903 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904 }
9905
Thomas Woutersa96affe2006-03-12 00:29:36 +00009906 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009907 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009909 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910 }
9911 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912 return (PyObject *)result;
9913
Benjamin Peterson29060642009-01-31 22:14:21 +00009914 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009915 Py_XDECREF(result);
9916 Py_DECREF(uformat);
9917 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009918 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919 }
9920 return NULL;
9921}
9922
Jeremy Hylton938ace62002-07-17 16:30:39 +00009923static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009924unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9925
Tim Peters6d6c1a32001-08-02 04:15:00 +00009926static PyObject *
9927unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9928{
Benjamin Peterson29060642009-01-31 22:14:21 +00009929 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009930 static char *kwlist[] = {"object", "encoding", "errors", 0};
9931 char *encoding = NULL;
9932 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009933
Benjamin Peterson14339b62009-01-31 16:36:08 +00009934 if (type != &PyUnicode_Type)
9935 return unicode_subtype_new(type, args, kwds);
9936 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009938 return NULL;
9939 if (x == NULL)
9940 return (PyObject *)_PyUnicode_New(0);
9941 if (encoding == NULL && errors == NULL)
9942 return PyObject_Str(x);
9943 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009944 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009945}
9946
Guido van Rossume023fe02001-08-30 03:12:59 +00009947static PyObject *
9948unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9949{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009950 PyUnicodeObject *tmp, *pnew;
9951 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009952
Benjamin Peterson14339b62009-01-31 16:36:08 +00009953 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9954 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9955 if (tmp == NULL)
9956 return NULL;
9957 assert(PyUnicode_Check(tmp));
9958 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9959 if (pnew == NULL) {
9960 Py_DECREF(tmp);
9961 return NULL;
9962 }
9963 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9964 if (pnew->str == NULL) {
9965 _Py_ForgetReference((PyObject *)pnew);
9966 PyObject_Del(pnew);
9967 Py_DECREF(tmp);
9968 return PyErr_NoMemory();
9969 }
9970 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9971 pnew->length = n;
9972 pnew->hash = tmp->hash;
9973 Py_DECREF(tmp);
9974 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009975}
9976
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009977PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009978 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009979\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009980Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009981encoding defaults to the current default string encoding.\n\
9982errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009983
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009984static PyObject *unicode_iter(PyObject *seq);
9985
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009987 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009988 "str", /* tp_name */
9989 sizeof(PyUnicodeObject), /* tp_size */
9990 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009992 (destructor)unicode_dealloc, /* tp_dealloc */
9993 0, /* tp_print */
9994 0, /* tp_getattr */
9995 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009996 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009997 unicode_repr, /* tp_repr */
9998 &unicode_as_number, /* tp_as_number */
9999 &unicode_as_sequence, /* tp_as_sequence */
10000 &unicode_as_mapping, /* tp_as_mapping */
10001 (hashfunc) unicode_hash, /* tp_hash*/
10002 0, /* tp_call*/
10003 (reprfunc) unicode_str, /* tp_str */
10004 PyObject_GenericGetAttr, /* tp_getattro */
10005 0, /* tp_setattro */
10006 0, /* tp_as_buffer */
10007 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000010008 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010009 unicode_doc, /* tp_doc */
10010 0, /* tp_traverse */
10011 0, /* tp_clear */
10012 PyUnicode_RichCompare, /* tp_richcompare */
10013 0, /* tp_weaklistoffset */
10014 unicode_iter, /* tp_iter */
10015 0, /* tp_iternext */
10016 unicode_methods, /* tp_methods */
10017 0, /* tp_members */
10018 0, /* tp_getset */
10019 &PyBaseObject_Type, /* tp_base */
10020 0, /* tp_dict */
10021 0, /* tp_descr_get */
10022 0, /* tp_descr_set */
10023 0, /* tp_dictoffset */
10024 0, /* tp_init */
10025 0, /* tp_alloc */
10026 unicode_new, /* tp_new */
10027 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028};
10029
10030/* Initialize the Unicode implementation */
10031
Thomas Wouters78890102000-07-22 19:25:51 +000010032void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010034 int i;
10035
Thomas Wouters477c8d52006-05-27 19:21:47 +000010036 /* XXX - move this array to unicodectype.c ? */
10037 Py_UNICODE linebreak[] = {
10038 0x000A, /* LINE FEED */
10039 0x000D, /* CARRIAGE RETURN */
10040 0x001C, /* FILE SEPARATOR */
10041 0x001D, /* GROUP SEPARATOR */
10042 0x001E, /* RECORD SEPARATOR */
10043 0x0085, /* NEXT LINE */
10044 0x2028, /* LINE SEPARATOR */
10045 0x2029, /* PARAGRAPH SEPARATOR */
10046 };
10047
Fred Drakee4315f52000-05-09 19:53:39 +000010048 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010049 free_list = NULL;
10050 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010052 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010053 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010054
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010055 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010056 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010057 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010058 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010059
10060 /* initialize the linebreak bloom filter */
10061 bloom_linebreak = make_bloom_mask(
10062 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10063 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010064
10065 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010066}
10067
10068/* Finalize the Unicode implementation */
10069
Christian Heimesa156e092008-02-16 07:38:31 +000010070int
10071PyUnicode_ClearFreeList(void)
10072{
10073 int freelist_size = numfree;
10074 PyUnicodeObject *u;
10075
10076 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010077 PyUnicodeObject *v = u;
10078 u = *(PyUnicodeObject **)u;
10079 if (v->str)
10080 PyObject_DEL(v->str);
10081 Py_XDECREF(v->defenc);
10082 PyObject_Del(v);
10083 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010084 }
10085 free_list = NULL;
10086 assert(numfree == 0);
10087 return freelist_size;
10088}
10089
Guido van Rossumd57fd912000-03-10 22:53:23 +000010090void
Thomas Wouters78890102000-07-22 19:25:51 +000010091_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010092{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010093 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010094
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010095 Py_XDECREF(unicode_empty);
10096 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010097
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010098 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010099 if (unicode_latin1[i]) {
10100 Py_DECREF(unicode_latin1[i]);
10101 unicode_latin1[i] = NULL;
10102 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010103 }
Christian Heimesa156e092008-02-16 07:38:31 +000010104 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010106
Walter Dörwald16807132007-05-25 13:52:07 +000010107void
10108PyUnicode_InternInPlace(PyObject **p)
10109{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010110 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10111 PyObject *t;
10112 if (s == NULL || !PyUnicode_Check(s))
10113 Py_FatalError(
10114 "PyUnicode_InternInPlace: unicode strings only please!");
10115 /* If it's a subclass, we don't really know what putting
10116 it in the interned dict might do. */
10117 if (!PyUnicode_CheckExact(s))
10118 return;
10119 if (PyUnicode_CHECK_INTERNED(s))
10120 return;
10121 if (interned == NULL) {
10122 interned = PyDict_New();
10123 if (interned == NULL) {
10124 PyErr_Clear(); /* Don't leave an exception */
10125 return;
10126 }
10127 }
10128 /* It might be that the GetItem call fails even
10129 though the key is present in the dictionary,
10130 namely when this happens during a stack overflow. */
10131 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010132 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010133 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010134
Benjamin Peterson29060642009-01-31 22:14:21 +000010135 if (t) {
10136 Py_INCREF(t);
10137 Py_DECREF(*p);
10138 *p = t;
10139 return;
10140 }
Walter Dörwald16807132007-05-25 13:52:07 +000010141
Benjamin Peterson14339b62009-01-31 16:36:08 +000010142 PyThreadState_GET()->recursion_critical = 1;
10143 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10144 PyErr_Clear();
10145 PyThreadState_GET()->recursion_critical = 0;
10146 return;
10147 }
10148 PyThreadState_GET()->recursion_critical = 0;
10149 /* The two references in interned are not counted by refcnt.
10150 The deallocator will take care of this */
10151 Py_REFCNT(s) -= 2;
10152 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010153}
10154
10155void
10156PyUnicode_InternImmortal(PyObject **p)
10157{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010158 PyUnicode_InternInPlace(p);
10159 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10160 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10161 Py_INCREF(*p);
10162 }
Walter Dörwald16807132007-05-25 13:52:07 +000010163}
10164
10165PyObject *
10166PyUnicode_InternFromString(const char *cp)
10167{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010168 PyObject *s = PyUnicode_FromString(cp);
10169 if (s == NULL)
10170 return NULL;
10171 PyUnicode_InternInPlace(&s);
10172 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010173}
10174
Alexander Belopolsky40018472011-02-26 01:02:56 +000010175void
10176_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010177{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010178 PyObject *keys;
10179 PyUnicodeObject *s;
10180 Py_ssize_t i, n;
10181 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010182
Benjamin Peterson14339b62009-01-31 16:36:08 +000010183 if (interned == NULL || !PyDict_Check(interned))
10184 return;
10185 keys = PyDict_Keys(interned);
10186 if (keys == NULL || !PyList_Check(keys)) {
10187 PyErr_Clear();
10188 return;
10189 }
Walter Dörwald16807132007-05-25 13:52:07 +000010190
Benjamin Peterson14339b62009-01-31 16:36:08 +000010191 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10192 detector, interned unicode strings are not forcibly deallocated;
10193 rather, we give them their stolen references back, and then clear
10194 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010195
Benjamin Peterson14339b62009-01-31 16:36:08 +000010196 n = PyList_GET_SIZE(keys);
10197 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010198 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010199 for (i = 0; i < n; i++) {
10200 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10201 switch (s->state) {
10202 case SSTATE_NOT_INTERNED:
10203 /* XXX Shouldn't happen */
10204 break;
10205 case SSTATE_INTERNED_IMMORTAL:
10206 Py_REFCNT(s) += 1;
10207 immortal_size += s->length;
10208 break;
10209 case SSTATE_INTERNED_MORTAL:
10210 Py_REFCNT(s) += 2;
10211 mortal_size += s->length;
10212 break;
10213 default:
10214 Py_FatalError("Inconsistent interned string state.");
10215 }
10216 s->state = SSTATE_NOT_INTERNED;
10217 }
10218 fprintf(stderr, "total size of all interned strings: "
10219 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10220 "mortal/immortal\n", mortal_size, immortal_size);
10221 Py_DECREF(keys);
10222 PyDict_Clear(interned);
10223 Py_DECREF(interned);
10224 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010225}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010226
10227
10228/********************* Unicode Iterator **************************/
10229
10230typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010231 PyObject_HEAD
10232 Py_ssize_t it_index;
10233 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010234} unicodeiterobject;
10235
10236static void
10237unicodeiter_dealloc(unicodeiterobject *it)
10238{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010239 _PyObject_GC_UNTRACK(it);
10240 Py_XDECREF(it->it_seq);
10241 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010242}
10243
10244static int
10245unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10246{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010247 Py_VISIT(it->it_seq);
10248 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010249}
10250
10251static PyObject *
10252unicodeiter_next(unicodeiterobject *it)
10253{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010254 PyUnicodeObject *seq;
10255 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010256
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 assert(it != NULL);
10258 seq = it->it_seq;
10259 if (seq == NULL)
10260 return NULL;
10261 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010262
Benjamin Peterson14339b62009-01-31 16:36:08 +000010263 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10264 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010265 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010266 if (item != NULL)
10267 ++it->it_index;
10268 return item;
10269 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010270
Benjamin Peterson14339b62009-01-31 16:36:08 +000010271 Py_DECREF(seq);
10272 it->it_seq = NULL;
10273 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010274}
10275
10276static PyObject *
10277unicodeiter_len(unicodeiterobject *it)
10278{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010279 Py_ssize_t len = 0;
10280 if (it->it_seq)
10281 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10282 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010283}
10284
10285PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10286
10287static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010288 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010289 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010290 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010291};
10292
10293PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010294 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10295 "str_iterator", /* tp_name */
10296 sizeof(unicodeiterobject), /* tp_basicsize */
10297 0, /* tp_itemsize */
10298 /* methods */
10299 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10300 0, /* tp_print */
10301 0, /* tp_getattr */
10302 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010303 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010304 0, /* tp_repr */
10305 0, /* tp_as_number */
10306 0, /* tp_as_sequence */
10307 0, /* tp_as_mapping */
10308 0, /* tp_hash */
10309 0, /* tp_call */
10310 0, /* tp_str */
10311 PyObject_GenericGetAttr, /* tp_getattro */
10312 0, /* tp_setattro */
10313 0, /* tp_as_buffer */
10314 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10315 0, /* tp_doc */
10316 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10317 0, /* tp_clear */
10318 0, /* tp_richcompare */
10319 0, /* tp_weaklistoffset */
10320 PyObject_SelfIter, /* tp_iter */
10321 (iternextfunc)unicodeiter_next, /* tp_iternext */
10322 unicodeiter_methods, /* tp_methods */
10323 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010324};
10325
10326static PyObject *
10327unicode_iter(PyObject *seq)
10328{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010329 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010330
Benjamin Peterson14339b62009-01-31 16:36:08 +000010331 if (!PyUnicode_Check(seq)) {
10332 PyErr_BadInternalCall();
10333 return NULL;
10334 }
10335 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10336 if (it == NULL)
10337 return NULL;
10338 it->it_index = 0;
10339 Py_INCREF(seq);
10340 it->it_seq = (PyUnicodeObject *)seq;
10341 _PyObject_GC_TRACK(it);
10342 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010343}
10344
Martin v. Löwis5b222132007-06-10 09:51:05 +000010345size_t
10346Py_UNICODE_strlen(const Py_UNICODE *u)
10347{
10348 int res = 0;
10349 while(*u++)
10350 res++;
10351 return res;
10352}
10353
10354Py_UNICODE*
10355Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10356{
10357 Py_UNICODE *u = s1;
10358 while ((*u++ = *s2++));
10359 return s1;
10360}
10361
10362Py_UNICODE*
10363Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10364{
10365 Py_UNICODE *u = s1;
10366 while ((*u++ = *s2++))
10367 if (n-- == 0)
10368 break;
10369 return s1;
10370}
10371
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010372Py_UNICODE*
10373Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10374{
10375 Py_UNICODE *u1 = s1;
10376 u1 += Py_UNICODE_strlen(u1);
10377 Py_UNICODE_strcpy(u1, s2);
10378 return s1;
10379}
10380
Martin v. Löwis5b222132007-06-10 09:51:05 +000010381int
10382Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10383{
10384 while (*s1 && *s2 && *s1 == *s2)
10385 s1++, s2++;
10386 if (*s1 && *s2)
10387 return (*s1 < *s2) ? -1 : +1;
10388 if (*s1)
10389 return 1;
10390 if (*s2)
10391 return -1;
10392 return 0;
10393}
10394
Victor Stinneref8d95c2010-08-16 22:03:11 +000010395int
10396Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10397{
10398 register Py_UNICODE u1, u2;
10399 for (; n != 0; n--) {
10400 u1 = *s1;
10401 u2 = *s2;
10402 if (u1 != u2)
10403 return (u1 < u2) ? -1 : +1;
10404 if (u1 == '\0')
10405 return 0;
10406 s1++;
10407 s2++;
10408 }
10409 return 0;
10410}
10411
Martin v. Löwis5b222132007-06-10 09:51:05 +000010412Py_UNICODE*
10413Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10414{
10415 const Py_UNICODE *p;
10416 for (p = s; *p; p++)
10417 if (*p == c)
10418 return (Py_UNICODE*)p;
10419 return NULL;
10420}
10421
Victor Stinner331ea922010-08-10 16:37:20 +000010422Py_UNICODE*
10423Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10424{
10425 const Py_UNICODE *p;
10426 p = s + Py_UNICODE_strlen(s);
10427 while (p != s) {
10428 p--;
10429 if (*p == c)
10430 return (Py_UNICODE*)p;
10431 }
10432 return NULL;
10433}
10434
Victor Stinner71133ff2010-09-01 23:43:53 +000010435Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010436PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010437{
10438 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10439 Py_UNICODE *copy;
10440 Py_ssize_t size;
10441
10442 /* Ensure we won't overflow the size. */
10443 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10444 PyErr_NoMemory();
10445 return NULL;
10446 }
10447 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10448 size *= sizeof(Py_UNICODE);
10449 copy = PyMem_Malloc(size);
10450 if (copy == NULL) {
10451 PyErr_NoMemory();
10452 return NULL;
10453 }
10454 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10455 return copy;
10456}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010457
Georg Brandl66c221e2010-10-14 07:04:07 +000010458/* A _string module, to export formatter_parser and formatter_field_name_split
10459 to the string.Formatter class implemented in Python. */
10460
10461static PyMethodDef _string_methods[] = {
10462 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10463 METH_O, PyDoc_STR("split the argument as a field name")},
10464 {"formatter_parser", (PyCFunction) formatter_parser,
10465 METH_O, PyDoc_STR("parse the argument as a format string")},
10466 {NULL, NULL}
10467};
10468
10469static struct PyModuleDef _string_module = {
10470 PyModuleDef_HEAD_INIT,
10471 "_string",
10472 PyDoc_STR("string helper module"),
10473 0,
10474 _string_methods,
10475 NULL,
10476 NULL,
10477 NULL,
10478 NULL
10479};
10480
10481PyMODINIT_FUNC
10482PyInit__string(void)
10483{
10484 return PyModule_Create(&_string_module);
10485}
10486
10487
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010488#ifdef __cplusplus
10489}
10490#endif