blob: cd0fccf4b5dca89b5734eb7edfedb715a38e2c47 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
98 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +000099 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters.
   The table has one entry per ASCII code point (0x00-0x7F); an entry is 1
   when the code point is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
144
Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
/* Same for linebreaks: one entry per ASCII code point (0x00-0x7F), 1 when
   the code point is a line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
201/* stuff to implement simple "bloom filters" for Unicode characters.
202 to keep things simple, we use a single bitmask, using the least 5
203 bits from each unicode characters as the bit index. */
204
205/* the linebreak mask is set up by Unicode_Init below */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
Thomas Wouters477c8d52006-05-27 19:21:47 +0000217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
Antoine Pitrouf068f942010-01-13 14:19:12 +0000221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223
Benjamin Peterson29060642009-01-31 22:14:21 +0000224#define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Non-zero if `chr` is one of the `setlen` characters in `set`.  `mask` is
   consulted first via BLOOM() so that most non-members are rejected without
   scanning `set`.  The whole expansion is parenthesized so the macro can be
   negated or combined with other operators safely; note that `chr` is
   evaluated twice. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
309/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000310 Ux0000 terminated; some code (e.g. new_identifier)
311 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312
313 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316*/
317
Alexander Belopolsky40018472011-02-26 01:02:56 +0000318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320{
321 register PyUnicodeObject *unicode;
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty);
326 return unicode_empty;
327 }
328
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000329 /* Ensure we won't overflow the size. */
330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331 return (PyUnicodeObject *)PyErr_NoMemory();
332 }
333
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000335 if (free_list) {
336 unicode = free_list;
337 free_list = *(PyUnicodeObject **)unicode;
338 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 if (unicode->str) {
340 /* Keep-Alive optimization: we only upsize the buffer,
341 never downsize it. */
342 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000343 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 PyObject_DEL(unicode->str);
345 unicode->str = NULL;
346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000347 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 }
352 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 }
354 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 if (unicode == NULL)
358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000363 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 PyErr_NoMemory();
365 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000367 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000368 * the caller fails before initializing str -- unicode_resize()
369 * reads str[0], and the Keep-Alive optimization can keep memory
370 * allocated for str alive across a call to unicode_dealloc(unicode).
371 * We don't want unicode_resize to read uninitialized memory in
372 * that case.
373 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000374 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000383 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000386 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388}
389
Alexander Belopolsky40018472011-02-26 01:02:56 +0000390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392{
Walter Dörwald16807132007-05-25 13:52:07 +0000393 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000394 case SSTATE_NOT_INTERNED:
395 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000396
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError(
402 "deletion of interned string failed");
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_IMMORTAL:
406 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 default:
409 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410 }
411
Guido van Rossum604ddf82001-12-06 20:03:56 +0000412 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000414 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416 PyObject_DEL(unicode->str);
417 unicode->str = NULL;
418 unicode->length = 0;
419 }
420 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000421 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 }
423 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000424 *(PyUnicodeObject **)unicode = free_list;
425 free_list = unicode;
426 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000429 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 }
433}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
Mark Dickinson081dfee2009-03-18 14:47:41 +0000581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
Walter Dörwald346737f2007-05-31 10:44:43 +0000681static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000684{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000685 *fmt++ = '%';
686 if (width) {
687 if (zeropad)
688 *fmt++ = '0';
689 fmt += sprintf(fmt, "%d", width);
690 }
691 if (precision)
692 fmt += sprintf(fmt, ".%d", precision);
693 if (longflag)
694 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000695 else if (longlongflag) {
696 /* longlongflag should only ever be nonzero on machines with
697 HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699 char *f = PY_FORMAT_LONG_LONG;
700 while (*f)
701 *fmt++ = *f++;
702#else
703 /* we shouldn't ever get here */
704 assert(0);
705 *fmt++ = 'l';
706#endif
707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000708 else if (size_tflag) {
709 char *f = PY_FORMAT_SIZE_T;
710 while (*f)
711 *fmt++ = *f++;
712 }
713 *fmt++ = c;
714 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000715}
716
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows the '%'.

   `f` must point at the '%'.  The optional width, the optional ".precision"
   and the optional length modifier ("l", "ll" or "z", recognised only when
   directly followed by 'd', 'u' or 'i') are parsed.  Each non-NULL p_*
   pointer receives the corresponding value (0 when absent).

   Returns a pointer to the conversion character.  For truncated formats
   the pointer is moved one step back so that the caller's own "f++" does
   not run past the terminating nul. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%' and parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    ++f;
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); ++f)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %li, %lld, %llu and %lli. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
781
/* Append the nul-terminated C string `string` to the output, one output
   unit per char.  The macro relies on two variables of the expanding scope:
   `copy` (a "const char *" scratch cursor) and `s` (the output write
   pointer, which is left pointing just past the appended characters).
   Wrapped in do/while(0) so that "appendstring(x);" behaves like a single
   statement, including inside an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy; ) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794PyObject *
795PyUnicode_FromFormatV(const char *format, va_list vargs)
796{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 va_list count;
798 Py_ssize_t callcount = 0;
799 PyObject **callresults = NULL;
800 PyObject **callresult = NULL;
801 Py_ssize_t n = 0;
802 int width = 0;
803 int precision = 0;
804 int zeropad;
805 const char* f;
806 Py_UNICODE *s;
807 PyObject *string;
808 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000809 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* use abuffer instead of buffer, if we need more space
811 * (which can happen if there's a format specifier with width). */
812 char *abuffer = NULL;
813 char *realbuffer;
814 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000816 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000818 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000819 /* step 1: count the number of %S/%R/%A/%s format specifications
820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000824 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +0000825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000828 ++callcount;
829 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000830 else if (128 <= (unsigned char)*f) {
831 PyErr_Format(PyExc_ValueError,
832 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000833 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000834 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000835 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000837 }
838 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000840 if (callcount) {
841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842 if (!callresults) {
843 PyErr_NoMemory();
844 return NULL;
845 }
846 callresult = callresults;
847 }
848 /* step 3: figure out how large a buffer we need */
849 for (f = format; *f; f++) {
850 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000851#ifdef HAVE_LONG_LONG
Victor Stinner96865452011-03-01 23:44:09 +0000852 int longlongflag;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000853#endif
Victor Stinner96865452011-03-01 23:44:09 +0000854 const char* p;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855
Victor Stinner96865452011-03-01 23:44:09 +0000856 p = f;
857 f = parse_format_flags(f, &width, NULL,
858 NULL, &longlongflag, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859
Benjamin Peterson14339b62009-01-31 16:36:08 +0000860 switch (*f) {
861 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000862 {
863#ifndef Py_UNICODE_WIDE
864 int ordinal = va_arg(count, int);
865 if (ordinal > 0xffff)
866 n += 2;
867 else
868 n++;
869#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000870 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000871 n++;
872#endif
873 break;
874 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000875 case '%':
876 n++;
877 break;
878 case 'd': case 'u': case 'i': case 'x':
879 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000880#ifdef HAVE_LONG_LONG
881 if (longlongflag) {
882 if (width < MAX_LONG_LONG_CHARS)
883 width = MAX_LONG_LONG_CHARS;
884 }
885 else
886#endif
887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888 including sign. Decimal takes the most space. This
889 isn't enough for octal. If a width is specified we
890 need more (which we allocate later). */
891 if (width < MAX_LONG_CHARS)
892 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000893 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000894 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000895 if (abuffersize < width)
896 abuffersize = width;
897 break;
898 case 's':
899 {
900 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000901 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903 if (!str)
904 goto fail;
905 n += PyUnicode_GET_SIZE(str);
906 /* Remember the str and switch to the next slot */
907 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000908 break;
909 }
910 case 'U':
911 {
912 PyObject *obj = va_arg(count, PyObject *);
913 assert(obj && PyUnicode_Check(obj));
914 n += PyUnicode_GET_SIZE(obj);
915 break;
916 }
917 case 'V':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000921 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000922 assert(obj || str);
923 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +0000924 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000925 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000926 *callresult++ = NULL;
927 }
928 else {
929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930 if (!str_obj)
931 goto fail;
932 n += PyUnicode_GET_SIZE(str_obj);
933 *callresult++ = str_obj;
934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000935 break;
936 }
937 case 'S':
938 {
939 PyObject *obj = va_arg(count, PyObject *);
940 PyObject *str;
941 assert(obj);
942 str = PyObject_Str(obj);
943 if (!str)
944 goto fail;
945 n += PyUnicode_GET_SIZE(str);
946 /* Remember the str and switch to the next slot */
947 *callresult++ = str;
948 break;
949 }
950 case 'R':
951 {
952 PyObject *obj = va_arg(count, PyObject *);
953 PyObject *repr;
954 assert(obj);
955 repr = PyObject_Repr(obj);
956 if (!repr)
957 goto fail;
958 n += PyUnicode_GET_SIZE(repr);
959 /* Remember the repr and switch to the next slot */
960 *callresult++ = repr;
961 break;
962 }
963 case 'A':
964 {
965 PyObject *obj = va_arg(count, PyObject *);
966 PyObject *ascii;
967 assert(obj);
968 ascii = PyObject_ASCII(obj);
969 if (!ascii)
970 goto fail;
971 n += PyUnicode_GET_SIZE(ascii);
972 /* Remember the repr and switch to the next slot */
973 *callresult++ = ascii;
974 break;
975 }
976 case 'p':
977 (void) va_arg(count, int);
978 /* maximum 64-bit pointer representation:
979 * 0xffffffffffffffff
980 * so 19 characters is enough.
981 * XXX I count 18 -- what's the extra for?
982 */
983 n += 19;
984 break;
985 default:
986 /* if we stumble upon an unknown
987 formatting code, copy the rest of
988 the format string to the output
989 string. (we cannot just skip the
990 code, since there's no way to know
991 what's in the argument list) */
992 n += strlen(p);
993 goto expand;
994 }
995 } else
996 n++;
997 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000998 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000999 if (abuffersize > ITEM_BUFFER_LEN) {
1000 /* add 1 for sprintf's trailing null byte */
1001 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 if (!abuffer) {
1003 PyErr_NoMemory();
1004 goto fail;
1005 }
1006 realbuffer = abuffer;
1007 }
1008 else
1009 realbuffer = buffer;
1010 /* step 4: fill the buffer */
1011 /* Since we've analyzed how much space we need for the worst case,
1012 we don't have to resize the string.
1013 There can be no errors beyond this point. */
1014 string = PyUnicode_FromUnicode(NULL, n);
1015 if (!string)
1016 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001017
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 s = PyUnicode_AS_UNICODE(string);
1019 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 for (f = format; *f; f++) {
1022 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001023 const char* p;
1024 int longflag;
1025 int longlongflag;
1026 int size_tflag;
1027
1028 p = f;
1029 zeropad = (f[1] == '0');
1030 f = parse_format_flags(f, &width, &precision,
1031 &longflag, &longlongflag, &size_tflag);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001032
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 switch (*f) {
1034 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001035 {
1036 int ordinal = va_arg(vargs, int);
1037#ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 *s++ = 0xD800 | (ordinal >> 10);
1041 *s++ = 0xDC00 | (ordinal & 0x3FF);
1042 } else
1043#endif
1044 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001046 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001047 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001048 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
Victor Stinner6d970f42011-03-02 00:04:25 +00001050 width, precision, *f);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 if (longflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001053#ifdef HAVE_LONG_LONG
1054 else if (longlongflag)
1055 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1056#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001057 else if (size_tflag)
1058 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1059 else
1060 sprintf(realbuffer, fmt, va_arg(vargs, int));
1061 appendstring(realbuffer);
1062 break;
1063 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001064 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1065 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001066 if (longflag)
1067 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001068#ifdef HAVE_LONG_LONG
1069 else if (longlongflag)
1070 sprintf(realbuffer, fmt, va_arg(vargs,
1071 unsigned PY_LONG_LONG));
1072#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001073 else if (size_tflag)
1074 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1075 else
1076 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1077 appendstring(realbuffer);
1078 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001080 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001081 sprintf(realbuffer, fmt, va_arg(vargs, int));
1082 appendstring(realbuffer);
1083 break;
1084 case 's':
1085 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001086 /* unused, since we already have the result */
1087 (void) va_arg(vargs, char *);
1088 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1089 PyUnicode_GET_SIZE(*callresult));
1090 s += PyUnicode_GET_SIZE(*callresult);
1091 /* We're done with the unicode()/repr() => forget it */
1092 Py_DECREF(*callresult);
1093 /* switch to next unicode()/repr() result */
1094 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001095 break;
1096 }
1097 case 'U':
1098 {
1099 PyObject *obj = va_arg(vargs, PyObject *);
1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102 s += size;
1103 break;
1104 }
1105 case 'V':
1106 {
1107 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001108 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001109 if (obj) {
1110 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1111 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1112 s += size;
1113 } else {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1115 PyUnicode_GET_SIZE(*callresult));
1116 s += PyUnicode_GET_SIZE(*callresult);
1117 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001119 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001120 break;
1121 }
1122 case 'S':
1123 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001124 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 {
1126 Py_UNICODE *ucopy;
1127 Py_ssize_t usize;
1128 Py_ssize_t upos;
1129 /* unused, since we already have the result */
1130 (void) va_arg(vargs, PyObject *);
1131 ucopy = PyUnicode_AS_UNICODE(*callresult);
1132 usize = PyUnicode_GET_SIZE(*callresult);
1133 for (upos = 0; upos<usize;)
1134 *s++ = ucopy[upos++];
1135 /* We're done with the unicode()/repr() => forget it */
1136 Py_DECREF(*callresult);
1137 /* switch to next unicode()/repr() result */
1138 ++callresult;
1139 break;
1140 }
1141 case 'p':
1142 sprintf(buffer, "%p", va_arg(vargs, void*));
1143 /* %p is ill-defined: ensure leading 0x. */
1144 if (buffer[1] == 'X')
1145 buffer[1] = 'x';
1146 else if (buffer[1] != 'x') {
1147 memmove(buffer+2, buffer, strlen(buffer)+1);
1148 buffer[0] = '0';
1149 buffer[1] = 'x';
1150 }
1151 appendstring(buffer);
1152 break;
1153 case '%':
1154 *s++ = '%';
1155 break;
1156 default:
1157 appendstring(p);
1158 goto end;
1159 }
Victor Stinner1205f272010-09-11 00:54:47 +00001160 }
Victor Stinner1205f272010-09-11 00:54:47 +00001161 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 *s++ = *f;
1163 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
Benjamin Peterson29060642009-01-31 22:14:21 +00001165 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 if (callresults)
1167 PyObject_Free(callresults);
1168 if (abuffer)
1169 PyObject_Free(abuffer);
1170 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1171 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001173 if (callresults) {
1174 PyObject **callresult2 = callresults;
1175 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001176 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001177 ++callresult2;
1178 }
1179 PyObject_Free(callresults);
1180 }
1181 if (abuffer)
1182 PyObject_Free(abuffer);
1183 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001184}
1185
1186#undef appendstring
1187
1188PyObject *
1189PyUnicode_FromFormat(const char *format, ...)
1190{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 PyObject* ret;
1192 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001193
1194#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001195 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001196#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001197 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001198#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001199 ret = PyUnicode_FromFormatV(format, vargs);
1200 va_end(vargs);
1201 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001202}
1203
Victor Stinner5593d8a2010-10-02 11:11:27 +00001204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1205 convert a Unicode object to a wide character string.
1206
1207 - If w is NULL: return the number of wide characters (including the nul
1208 character) required to convert the unicode object. Ignore size argument.
1209
1210 - Otherwise: return the number of wide characters (excluding the nul
1211 character) written into w. Write at most size wide characters (including
1212 the nul character). */
1213static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001214unicode_aswidechar(PyUnicodeObject *unicode,
1215 wchar_t *w,
1216 Py_ssize_t size)
1217{
1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001219 Py_ssize_t res;
1220 if (w != NULL) {
1221 res = PyUnicode_GET_SIZE(unicode);
1222 if (size > res)
1223 size = res + 1;
1224 else
1225 res = size;
1226 memcpy(w, unicode->str, size * sizeof(wchar_t));
1227 return res;
1228 }
1229 else
1230 return PyUnicode_GET_SIZE(unicode) + 1;
1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1232 register const Py_UNICODE *u;
1233 const Py_UNICODE *uend;
1234 const wchar_t *worig, *wend;
1235 Py_ssize_t nchar;
1236
Victor Stinner137c34c2010-09-29 10:25:54 +00001237 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001238 uend = u + PyUnicode_GET_SIZE(unicode);
1239 if (w != NULL) {
1240 worig = w;
1241 wend = w + size;
1242 while (u != uend && w != wend) {
1243 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1244 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1245 {
1246 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1247 u += 2;
1248 }
1249 else {
1250 *w = *u;
1251 u++;
1252 }
1253 w++;
1254 }
1255 if (w != wend)
1256 *w = L'\0';
1257 return w - worig;
1258 }
1259 else {
1260 nchar = 1; /* nul character at the end */
1261 while (u != uend) {
1262 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1263 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1264 u += 2;
1265 else
1266 u++;
1267 nchar++;
1268 }
1269 }
1270 return nchar;
1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1272 register Py_UNICODE *u, *uend, ordinal;
1273 register Py_ssize_t i;
1274 wchar_t *worig, *wend;
1275 Py_ssize_t nchar;
1276
1277 u = PyUnicode_AS_UNICODE(unicode);
1278 uend = u + PyUnicode_GET_SIZE(u);
1279 if (w != NULL) {
1280 worig = w;
1281 wend = w + size;
1282 while (u != uend && w != wend) {
1283 ordinal = *u;
1284 if (ordinal > 0xffff) {
1285 ordinal -= 0x10000;
1286 *w++ = 0xD800 | (ordinal >> 10);
1287 *w++ = 0xDC00 | (ordinal & 0x3FF);
1288 }
1289 else
1290 *w++ = ordinal;
1291 u++;
1292 }
1293 if (w != wend)
1294 *w = 0;
1295 return w - worig;
1296 }
1297 else {
1298 nchar = 1; /* nul character */
1299 while (u != uend) {
1300 if (*u > 0xffff)
1301 nchar += 2;
1302 else
1303 nchar++;
1304 u++;
1305 }
1306 return nchar;
1307 }
1308#else
1309# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001310#endif
1311}
1312
1313Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001315 wchar_t *w,
1316 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317{
1318 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001319 PyErr_BadInternalCall();
1320 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323}
1324
Victor Stinner137c34c2010-09-29 10:25:54 +00001325wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001326PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001327 Py_ssize_t *size)
1328{
1329 wchar_t* buffer;
1330 Py_ssize_t buflen;
1331
1332 if (unicode == NULL) {
1333 PyErr_BadInternalCall();
1334 return NULL;
1335 }
1336
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001338 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001339 PyErr_NoMemory();
1340 return NULL;
1341 }
1342
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1344 if (buffer == NULL) {
1345 PyErr_NoMemory();
1346 return NULL;
1347 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001348 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001349 if (size != NULL)
1350 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001351 return buffer;
1352}
1353
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354#endif
1355
Alexander Belopolsky40018472011-02-26 01:02:56 +00001356PyObject *
1357PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001358{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001359 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001361 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 PyErr_SetString(PyExc_ValueError,
1363 "chr() arg not in range(0x110000)");
1364 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001365 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001366
1367#ifndef Py_UNICODE_WIDE
1368 if (ordinal > 0xffff) {
1369 ordinal -= 0x10000;
1370 s[0] = 0xD800 | (ordinal >> 10);
1371 s[1] = 0xDC00 | (ordinal & 0x3FF);
1372 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001373 }
1374#endif
1375
Hye-Shik Chang40574832004-04-06 07:24:51 +00001376 s[0] = (Py_UNICODE)ordinal;
1377 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001378}
1379
Alexander Belopolsky40018472011-02-26 01:02:56 +00001380PyObject *
1381PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001383 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001384 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001385 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001386 Py_INCREF(obj);
1387 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001388 }
1389 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 /* For a Unicode subtype that's not a Unicode object,
1391 return a true Unicode object with the same data. */
1392 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1393 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001394 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001395 PyErr_Format(PyExc_TypeError,
1396 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001397 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001398 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001399}
1400
Alexander Belopolsky40018472011-02-26 01:02:56 +00001401PyObject *
1402PyUnicode_FromEncodedObject(register PyObject *obj,
1403 const char *encoding,
1404 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001405{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001406 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001407 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001408
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 PyErr_BadInternalCall();
1411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Decoding bytes objects is the most common case and should be fast */
1415 if (PyBytes_Check(obj)) {
1416 if (PyBytes_GET_SIZE(obj) == 0) {
1417 Py_INCREF(unicode_empty);
1418 v = (PyObject *) unicode_empty;
1419 }
1420 else {
1421 v = PyUnicode_Decode(
1422 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1423 encoding, errors);
1424 }
1425 return v;
1426 }
1427
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001428 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 PyErr_SetString(PyExc_TypeError,
1430 "decoding str is not supported");
1431 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001432 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001433
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001434 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1435 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1436 PyErr_Format(PyExc_TypeError,
1437 "coercing to str: need bytes, bytearray "
1438 "or buffer-like object, %.80s found",
1439 Py_TYPE(obj)->tp_name);
1440 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001441 }
Tim Petersced69f82003-09-16 20:30:58 +00001442
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001443 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001445 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 }
Tim Petersced69f82003-09-16 20:30:58 +00001447 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001448 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001449
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001450 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001451 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
/* Copy `encoding` into `lower`, converting upper case letters to lower
   case and replacing '_' with '-', in order to catch e.g. UTF_8.

   `lower_len` is the size of the `lower` buffer, including room for the
   terminating nul.  Return 1 on success; return 0 if the encoding name is
   longer than lower_len-1 characters (in which case `lower` is left
   without a terminating nul). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the nul terminator */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++, dst++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
    }
    *dst = '\0';
    return 1;
}
1486
Alexander Belopolsky40018472011-02-26 01:02:56 +00001487PyObject *
1488PyUnicode_Decode(const char *s,
1489 Py_ssize_t size,
1490 const char *encoding,
1491 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001492{
1493 PyObject *buffer = NULL, *unicode;
1494 Py_buffer info;
1495 char lower[11]; /* Enough for any encoding shortcut */
1496
1497 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001499
1500 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001501 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001502 if ((strcmp(lower, "utf-8") == 0) ||
1503 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001504 return PyUnicode_DecodeUTF8(s, size, errors);
1505 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001506 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001507 (strcmp(lower, "iso-8859-1") == 0))
1508 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001509#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001510 else if (strcmp(lower, "mbcs") == 0)
1511 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001513 else if (strcmp(lower, "ascii") == 0)
1514 return PyUnicode_DecodeASCII(s, size, errors);
1515 else if (strcmp(lower, "utf-16") == 0)
1516 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1517 else if (strcmp(lower, "utf-32") == 0)
1518 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520
1521 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001522 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001523 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001524 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001525 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 if (buffer == NULL)
1527 goto onError;
1528 unicode = PyCodec_Decode(buffer, encoding, errors);
1529 if (unicode == NULL)
1530 goto onError;
1531 if (!PyUnicode_Check(unicode)) {
1532 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001533 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001534 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535 Py_DECREF(unicode);
1536 goto onError;
1537 }
1538 Py_DECREF(buffer);
1539 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001540
Benjamin Peterson29060642009-01-31 22:14:21 +00001541 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 Py_XDECREF(buffer);
1543 return NULL;
1544}
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546PyObject *
1547PyUnicode_AsDecodedObject(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 return v;
1566
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001568 return NULL;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571PyObject *
1572PyUnicode_AsDecodedUnicode(PyObject *unicode,
1573 const char *encoding,
1574 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575{
1576 PyObject *v;
1577
1578 if (!PyUnicode_Check(unicode)) {
1579 PyErr_BadArgument();
1580 goto onError;
1581 }
1582
1583 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001585
1586 /* Decode via the codec registry */
1587 v = PyCodec_Decode(unicode, encoding, errors);
1588 if (v == NULL)
1589 goto onError;
1590 if (!PyUnicode_Check(v)) {
1591 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001592 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001593 Py_TYPE(v)->tp_name);
1594 Py_DECREF(v);
1595 goto onError;
1596 }
1597 return v;
1598
Benjamin Peterson29060642009-01-31 22:14:21 +00001599 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001600 return NULL;
1601}
1602
Alexander Belopolsky40018472011-02-26 01:02:56 +00001603PyObject *
1604PyUnicode_Encode(const Py_UNICODE *s,
1605 Py_ssize_t size,
1606 const char *encoding,
1607 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608{
1609 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001610
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 unicode = PyUnicode_FromUnicode(s, size);
1612 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1615 Py_DECREF(unicode);
1616 return v;
1617}
1618
Alexander Belopolsky40018472011-02-26 01:02:56 +00001619PyObject *
1620PyUnicode_AsEncodedObject(PyObject *unicode,
1621 const char *encoding,
1622 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001623{
1624 PyObject *v;
1625
1626 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument();
1628 goto onError;
1629 }
1630
1631 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001633
1634 /* Encode via the codec registry */
1635 v = PyCodec_Encode(unicode, encoding, errors);
1636 if (v == NULL)
1637 goto onError;
1638 return v;
1639
Benjamin Peterson29060642009-01-31 22:14:21 +00001640 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001641 return NULL;
1642}
1643
Victor Stinnerad158722010-10-27 00:25:46 +00001644PyObject *
1645PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001646{
Victor Stinner313a1202010-06-11 23:56:51 +00001647#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode),
1650 NULL);
1651#elif defined(__APPLE__)
1652 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 "surrogateescape");
1655#else
1656 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001657 return PyUnicode_AsEncodedString(unicode,
1658 Py_FileSystemDefaultEncoding,
1659 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001660 }
1661 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001662 /* locale encoding with surrogateescape */
1663 wchar_t *wchar;
1664 char *bytes;
1665 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001666 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001667
1668 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1669 if (wchar == NULL)
1670 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001671 bytes = _Py_wchar2char(wchar, &error_pos);
1672 if (bytes == NULL) {
1673 if (error_pos != (size_t)-1) {
1674 char *errmsg = strerror(errno);
1675 PyObject *exc = NULL;
1676 if (errmsg == NULL)
1677 errmsg = "Py_wchar2char() failed";
1678 raise_encode_exception(&exc,
1679 "filesystemencoding",
1680 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1681 error_pos, error_pos+1,
1682 errmsg);
1683 Py_XDECREF(exc);
1684 }
1685 else
1686 PyErr_NoMemory();
1687 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001688 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001689 }
1690 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001691
1692 bytes_obj = PyBytes_FromString(bytes);
1693 PyMem_Free(bytes);
1694 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001695 }
Victor Stinnerad158722010-10-27 00:25:46 +00001696#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001697}
1698
Alexander Belopolsky40018472011-02-26 01:02:56 +00001699PyObject *
1700PyUnicode_AsEncodedString(PyObject *unicode,
1701 const char *encoding,
1702 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703{
1704 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001705 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001706
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 if (!PyUnicode_Check(unicode)) {
1708 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 }
Fred Drakee4315f52000-05-09 19:53:39 +00001711
Tim Petersced69f82003-09-16 20:30:58 +00001712 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001713 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1714 PyUnicode_GET_SIZE(unicode),
1715 errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001716
1717 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001718 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001719 if ((strcmp(lower, "utf-8") == 0) ||
1720 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001721 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1722 PyUnicode_GET_SIZE(unicode),
1723 errors);
1724 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001725 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001726 (strcmp(lower, "iso-8859-1") == 0))
1727 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1728 PyUnicode_GET_SIZE(unicode),
1729 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001730#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001731 else if (strcmp(lower, "mbcs") == 0)
1732 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1733 PyUnicode_GET_SIZE(unicode),
1734 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001735#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001736 else if (strcmp(lower, "ascii") == 0)
1737 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1738 PyUnicode_GET_SIZE(unicode),
1739 errors);
1740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741
1742 /* Encode via the codec registry */
1743 v = PyCodec_Encode(unicode, encoding, errors);
1744 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001745 return NULL;
1746
1747 /* The normal path */
1748 if (PyBytes_Check(v))
1749 return v;
1750
1751 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001752 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001753 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001754 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001755
1756 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1757 "encoder %s returned bytearray instead of bytes",
1758 encoding);
1759 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001760 Py_DECREF(v);
1761 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001762 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001763
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001764 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1765 Py_DECREF(v);
1766 return b;
1767 }
1768
1769 PyErr_Format(PyExc_TypeError,
1770 "encoder did not return a bytes object (type=%.400s)",
1771 Py_TYPE(v)->tp_name);
1772 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001773 return NULL;
1774}
1775
Alexander Belopolsky40018472011-02-26 01:02:56 +00001776PyObject *
1777PyUnicode_AsEncodedUnicode(PyObject *unicode,
1778 const char *encoding,
1779 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001780{
1781 PyObject *v;
1782
1783 if (!PyUnicode_Check(unicode)) {
1784 PyErr_BadArgument();
1785 goto onError;
1786 }
1787
1788 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001789 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001790
1791 /* Encode via the codec registry */
1792 v = PyCodec_Encode(unicode, encoding, errors);
1793 if (v == NULL)
1794 goto onError;
1795 if (!PyUnicode_Check(v)) {
1796 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001797 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001798 Py_TYPE(v)->tp_name);
1799 Py_DECREF(v);
1800 goto onError;
1801 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001803
Benjamin Peterson29060642009-01-31 22:14:21 +00001804 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 return NULL;
1806}
1807
Alexander Belopolsky40018472011-02-26 01:02:56 +00001808PyObject *
1809_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1810 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001811{
1812 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001813 if (v)
1814 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001815 if (errors != NULL)
1816 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001817 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001818 PyUnicode_GET_SIZE(unicode),
1819 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001820 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001821 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001822 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001823 return v;
1824}
1825
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001826PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001827PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001828 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001829 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1830}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001831
Christian Heimes5894ba72007-11-04 11:43:14 +00001832PyObject*
1833PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1834{
Victor Stinnerad158722010-10-27 00:25:46 +00001835#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1836 return PyUnicode_DecodeMBCS(s, size, NULL);
1837#elif defined(__APPLE__)
1838 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1839#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001840 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1841 can be undefined. If it is case, decode using UTF-8. The following assumes
1842 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1843 bootstrapping process where the codecs aren't ready yet.
1844 */
1845 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001846 return PyUnicode_Decode(s, size,
1847 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001848 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001849 }
1850 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001851 /* locale encoding with surrogateescape */
1852 wchar_t *wchar;
1853 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001854 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001855
1856 if (s[size] != '\0' || size != strlen(s)) {
1857 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1858 return NULL;
1859 }
1860
Victor Stinner168e1172010-10-16 23:16:16 +00001861 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001862 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001863 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001864
Victor Stinner168e1172010-10-16 23:16:16 +00001865 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001866 PyMem_Free(wchar);
1867 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001868 }
Victor Stinnerad158722010-10-27 00:25:46 +00001869#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001870}
1871
Martin v. Löwis011e8422009-05-05 04:43:17 +00001872
1873int
1874PyUnicode_FSConverter(PyObject* arg, void* addr)
1875{
1876 PyObject *output = NULL;
1877 Py_ssize_t size;
1878 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001879 if (arg == NULL) {
1880 Py_DECREF(*(PyObject**)addr);
1881 return 1;
1882 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001883 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001884 output = arg;
1885 Py_INCREF(output);
1886 }
1887 else {
1888 arg = PyUnicode_FromObject(arg);
1889 if (!arg)
1890 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001891 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001892 Py_DECREF(arg);
1893 if (!output)
1894 return 0;
1895 if (!PyBytes_Check(output)) {
1896 Py_DECREF(output);
1897 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1898 return 0;
1899 }
1900 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001901 size = PyBytes_GET_SIZE(output);
1902 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001903 if (size != strlen(data)) {
1904 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1905 Py_DECREF(output);
1906 return 0;
1907 }
1908 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001909 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001910}
1911
1912
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001913int
1914PyUnicode_FSDecoder(PyObject* arg, void* addr)
1915{
1916 PyObject *output = NULL;
1917 Py_ssize_t size;
1918 void *data;
1919 if (arg == NULL) {
1920 Py_DECREF(*(PyObject**)addr);
1921 return 1;
1922 }
1923 if (PyUnicode_Check(arg)) {
1924 output = arg;
1925 Py_INCREF(output);
1926 }
1927 else {
1928 arg = PyBytes_FromObject(arg);
1929 if (!arg)
1930 return 0;
1931 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1932 PyBytes_GET_SIZE(arg));
1933 Py_DECREF(arg);
1934 if (!output)
1935 return 0;
1936 if (!PyUnicode_Check(output)) {
1937 Py_DECREF(output);
1938 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1939 return 0;
1940 }
1941 }
1942 size = PyUnicode_GET_SIZE(output);
1943 data = PyUnicode_AS_UNICODE(output);
1944 if (size != Py_UNICODE_strlen(data)) {
1945 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1946 Py_DECREF(output);
1947 return 0;
1948 }
1949 *(PyObject**)addr = output;
1950 return Py_CLEANUP_SUPPORTED;
1951}
1952
1953
Martin v. Löwis5b222132007-06-10 09:51:05 +00001954char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001955_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001956{
Christian Heimesf3863112007-11-22 07:46:41 +00001957 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001958 if (!PyUnicode_Check(unicode)) {
1959 PyErr_BadArgument();
1960 return NULL;
1961 }
Christian Heimesf3863112007-11-22 07:46:41 +00001962 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1963 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001964 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001965 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001966 *psize = PyBytes_GET_SIZE(bytes);
1967 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001968}
1969
1970char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001971_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001972{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001973 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001974}
1975
Alexander Belopolsky40018472011-02-26 01:02:56 +00001976Py_UNICODE *
1977PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978{
1979 if (!PyUnicode_Check(unicode)) {
1980 PyErr_BadArgument();
1981 goto onError;
1982 }
1983 return PyUnicode_AS_UNICODE(unicode);
1984
Benjamin Peterson29060642009-01-31 22:14:21 +00001985 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 return NULL;
1987}
1988
Alexander Belopolsky40018472011-02-26 01:02:56 +00001989Py_ssize_t
1990PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991{
1992 if (!PyUnicode_Check(unicode)) {
1993 PyErr_BadArgument();
1994 goto onError;
1995 }
1996 return PyUnicode_GET_SIZE(unicode);
1997
Benjamin Peterson29060642009-01-31 22:14:21 +00001998 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999 return -1;
2000}
2001
/* Return the name of the default encoding.  The value is the fixed
   string "utf-8"; the returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2007
Victor Stinner554f3f02010-06-16 23:33:54 +00002008/* create or adjust a UnicodeDecodeError */
2009static void
2010make_decode_exception(PyObject **exceptionObject,
2011 const char *encoding,
2012 const char *input, Py_ssize_t length,
2013 Py_ssize_t startpos, Py_ssize_t endpos,
2014 const char *reason)
2015{
2016 if (*exceptionObject == NULL) {
2017 *exceptionObject = PyUnicodeDecodeError_Create(
2018 encoding, input, length, startpos, endpos, reason);
2019 }
2020 else {
2021 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2022 goto onError;
2023 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2024 goto onError;
2025 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2026 goto onError;
2027 }
2028 return;
2029
2030onError:
2031 Py_DECREF(*exceptionObject);
2032 *exceptionObject = NULL;
2033}
2034
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002035/* error handling callback helper:
2036 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002037 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002038 and adjust various state variables.
2039 return 0 on success, -1 on error
2040*/
2041
Alexander Belopolsky40018472011-02-26 01:02:56 +00002042static int
2043unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2044 const char *encoding, const char *reason,
2045 const char **input, const char **inend, Py_ssize_t *startinpos,
2046 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2047 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002048{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002049 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002050
2051 PyObject *restuple = NULL;
2052 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002053 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002054 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002055 Py_ssize_t requiredsize;
2056 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002058 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002059 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002060 int res = -1;
2061
2062 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002063 *errorHandler = PyCodec_LookupError(errors);
2064 if (*errorHandler == NULL)
2065 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 }
2067
Victor Stinner554f3f02010-06-16 23:33:54 +00002068 make_decode_exception(exceptionObject,
2069 encoding,
2070 *input, *inend - *input,
2071 *startinpos, *endinpos,
2072 reason);
2073 if (*exceptionObject == NULL)
2074 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002075
2076 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2077 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002078 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002080 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002081 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 }
2083 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002084 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002085
2086 /* Copy back the bytes variables, which might have been modified by the
2087 callback */
2088 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2089 if (!inputobj)
2090 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002091 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002092 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002093 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002094 *input = PyBytes_AS_STRING(inputobj);
2095 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002096 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002097 /* we can DECREF safely, as the exception has another reference,
2098 so the object won't go away. */
2099 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002100
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002101 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002102 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002103 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002104 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2105 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002106 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107
2108 /* need more space? (at least enough for what we
2109 have+the replacement+the rest of the string (starting
2110 at the new input position), so we won't have to check space
2111 when there are no errors in the rest of the string) */
2112 repptr = PyUnicode_AS_UNICODE(repunicode);
2113 repsize = PyUnicode_GET_SIZE(repunicode);
2114 requiredsize = *outpos + repsize + insize-newpos;
2115 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002116 if (requiredsize<2*outsize)
2117 requiredsize = 2*outsize;
2118 if (_PyUnicode_Resize(output, requiredsize) < 0)
2119 goto onError;
2120 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002121 }
2122 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002123 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002124 Py_UNICODE_COPY(*outptr, repptr, repsize);
2125 *outptr += repsize;
2126 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002127
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002128 /* we made it! */
2129 res = 0;
2130
Benjamin Peterson29060642009-01-31 22:14:21 +00002131 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 Py_XDECREF(restuple);
2133 return res;
2134}
2135
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * expression arguments (e.g. a conditional expression) keep their
 * meaning when combined with &&.  The argument c is evaluated more than
 * once, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002216
Alexander Belopolsky40018472011-02-26 01:02:56 +00002217PyObject *
2218PyUnicode_DecodeUTF7(const char *s,
2219 Py_ssize_t size,
2220 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002221{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002222 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2223}
2224
Antoine Pitrou244651a2009-05-04 18:56:13 +00002225/* The decoder. The only state we preserve is our read position,
2226 * i.e. how many characters we have consumed. So if we end in the
2227 * middle of a shift sequence we have to back off the read position
2228 * and the output to the beginning of the sequence, otherwise we lose
2229 * all the shift state (seen bits, number of bits seen, high
2230 * surrogate). */
2231
Alexander Belopolsky40018472011-02-26 01:02:56 +00002232PyObject *
2233PyUnicode_DecodeUTF7Stateful(const char *s,
2234 Py_ssize_t size,
2235 const char *errors,
2236 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002237{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002238 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002239 Py_ssize_t startinpos;
2240 Py_ssize_t endinpos;
2241 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002242 const char *e;
2243 PyUnicodeObject *unicode;
2244 Py_UNICODE *p;
2245 const char *errmsg = "";
2246 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002247 Py_UNICODE *shiftOutStart;
2248 unsigned int base64bits = 0;
2249 unsigned long base64buffer = 0;
2250 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251 PyObject *errorHandler = NULL;
2252 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002253
2254 unicode = _PyUnicode_New(size);
2255 if (!unicode)
2256 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002257 if (size == 0) {
2258 if (consumed)
2259 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002260 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002261 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002262
2263 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002264 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002265 e = s + size;
2266
2267 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002268 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002269 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002270 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002271
Antoine Pitrou244651a2009-05-04 18:56:13 +00002272 if (inShift) { /* in a base-64 section */
2273 if (IS_BASE64(ch)) { /* consume a base-64 character */
2274 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2275 base64bits += 6;
2276 s++;
2277 if (base64bits >= 16) {
2278 /* we have enough bits for a UTF-16 value */
2279 Py_UNICODE outCh = (Py_UNICODE)
2280 (base64buffer >> (base64bits-16));
2281 base64bits -= 16;
2282 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2283 if (surrogate) {
2284 /* expecting a second surrogate */
2285 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2286#ifdef Py_UNICODE_WIDE
2287 *p++ = (((surrogate & 0x3FF)<<10)
2288 | (outCh & 0x3FF)) + 0x10000;
2289#else
2290 *p++ = surrogate;
2291 *p++ = outCh;
2292#endif
2293 surrogate = 0;
2294 }
2295 else {
2296 surrogate = 0;
2297 errmsg = "second surrogate missing";
2298 goto utf7Error;
2299 }
2300 }
2301 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2302 /* first surrogate */
2303 surrogate = outCh;
2304 }
2305 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2306 errmsg = "unexpected second surrogate";
2307 goto utf7Error;
2308 }
2309 else {
2310 *p++ = outCh;
2311 }
2312 }
2313 }
2314 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002315 inShift = 0;
2316 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002317 if (surrogate) {
2318 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002319 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002320 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002321 if (base64bits > 0) { /* left-over bits */
2322 if (base64bits >= 6) {
2323 /* We've seen at least one base-64 character */
2324 errmsg = "partial character in shift sequence";
2325 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002326 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002327 else {
2328 /* Some bits remain; they should be zero */
2329 if (base64buffer != 0) {
2330 errmsg = "non-zero padding bits in shift sequence";
2331 goto utf7Error;
2332 }
2333 }
2334 }
2335 if (ch != '-') {
2336 /* '-' is absorbed; other terminating
2337 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002338 *p++ = ch;
2339 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002340 }
2341 }
2342 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002343 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002344 s++; /* consume '+' */
2345 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002346 s++;
2347 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 }
2349 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002350 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002351 shiftOutStart = p;
2352 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002353 }
2354 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002355 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002356 *p++ = ch;
2357 s++;
2358 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002359 else {
2360 startinpos = s-starts;
2361 s++;
2362 errmsg = "unexpected special character";
2363 goto utf7Error;
2364 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002365 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002366utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002367 outpos = p-PyUnicode_AS_UNICODE(unicode);
2368 endinpos = s-starts;
2369 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002370 errors, &errorHandler,
2371 "utf7", errmsg,
2372 &starts, &e, &startinpos, &endinpos, &exc, &s,
2373 &unicode, &outpos, &p))
2374 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002375 }
2376
Antoine Pitrou244651a2009-05-04 18:56:13 +00002377 /* end of string */
2378
2379 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2380 /* if we're in an inconsistent state, that's an error */
2381 if (surrogate ||
2382 (base64bits >= 6) ||
2383 (base64bits > 0 && base64buffer != 0)) {
2384 outpos = p-PyUnicode_AS_UNICODE(unicode);
2385 endinpos = size;
2386 if (unicode_decode_call_errorhandler(
2387 errors, &errorHandler,
2388 "utf7", "unterminated shift sequence",
2389 &starts, &e, &startinpos, &endinpos, &exc, &s,
2390 &unicode, &outpos, &p))
2391 goto onError;
2392 if (s < e)
2393 goto restart;
2394 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002395 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002396
2397 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002398 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002399 if (inShift) {
2400 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002401 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002402 }
2403 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002404 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002405 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002406 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002407
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002408 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002409 goto onError;
2410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002411 Py_XDECREF(errorHandler);
2412 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002413 return (PyObject *)unicode;
2414
Benjamin Peterson29060642009-01-31 22:14:21 +00002415 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002416 Py_XDECREF(errorHandler);
2417 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002418 Py_DECREF(unicode);
2419 return NULL;
2420}
2421
2422
Alexander Belopolsky40018472011-02-26 01:02:56 +00002423PyObject *
2424PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2425 Py_ssize_t size,
2426 int base64SetO,
2427 int base64WhiteSpace,
2428 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002429{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002430 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002431 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002432 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002433 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002434 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002435 unsigned int base64bits = 0;
2436 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002437 char * out;
2438 char * start;
2439
2440 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002441 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002442
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002443 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002444 return PyErr_NoMemory();
2445
Antoine Pitrou244651a2009-05-04 18:56:13 +00002446 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002447 if (v == NULL)
2448 return NULL;
2449
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002450 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002451 for (;i < size; ++i) {
2452 Py_UNICODE ch = s[i];
2453
Antoine Pitrou244651a2009-05-04 18:56:13 +00002454 if (inShift) {
2455 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2456 /* shifting out */
2457 if (base64bits) { /* output remaining bits */
2458 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2459 base64buffer = 0;
2460 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002461 }
2462 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002463 /* Characters not in the BASE64 set implicitly unshift the sequence
2464 so no '-' is required, except if the character is itself a '-' */
2465 if (IS_BASE64(ch) || ch == '-') {
2466 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002467 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002468 *out++ = (char) ch;
2469 }
2470 else {
2471 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002472 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002473 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002474 else { /* not in a shift sequence */
2475 if (ch == '+') {
2476 *out++ = '+';
2477 *out++ = '-';
2478 }
2479 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2480 *out++ = (char) ch;
2481 }
2482 else {
2483 *out++ = '+';
2484 inShift = 1;
2485 goto encode_char;
2486 }
2487 }
2488 continue;
2489encode_char:
2490#ifdef Py_UNICODE_WIDE
2491 if (ch >= 0x10000) {
2492 /* code first surrogate */
2493 base64bits += 16;
2494 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2495 while (base64bits >= 6) {
2496 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2497 base64bits -= 6;
2498 }
2499 /* prepare second surrogate */
2500 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2501 }
2502#endif
2503 base64bits += 16;
2504 base64buffer = (base64buffer << 16) | ch;
2505 while (base64bits >= 6) {
2506 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2507 base64bits -= 6;
2508 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002509 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002510 if (base64bits)
2511 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2512 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002513 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002514 if (_PyBytes_Resize(&v, out - start) < 0)
2515 return NULL;
2516 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002517}
2518
Antoine Pitrou244651a2009-05-04 18:56:13 +00002519#undef IS_BASE64
2520#undef FROM_BASE64
2521#undef TO_BASE64
2522#undef DECODE_DIRECT
2523#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002524
Guido van Rossumd57fd912000-03-10 22:53:23 +00002525/* --- UTF-8 Codec -------------------------------------------------------- */
2526
/* Length of a UTF-8 sequence, indexed by its first byte.  An entry of 0
   marks a byte that cannot start a sequence (continuation bytes, the
   overlong prefixes C0/C1, and F5..FF).  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F: ASCII */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F: continuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 illegal, C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4, F5-FF illegal */
};
2548
Alexander Belopolsky40018472011-02-26 01:02:56 +00002549PyObject *
2550PyUnicode_DecodeUTF8(const char *s,
2551 Py_ssize_t size,
2552 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553{
Walter Dörwald69652032004-09-07 20:24:22 +00002554 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2555}
2556
Antoine Pitrouab868312009-01-10 15:40:25 +00002557/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2558#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2559
2560/* Mask to quickly check whether a C 'long' contains a
2561 non-ASCII, UTF8-encoded char. */
2562#if (SIZEOF_LONG == 8)
2563# define ASCII_CHAR_MASK 0x8080808080808080L
2564#elif (SIZEOF_LONG == 4)
2565# define ASCII_CHAR_MASK 0x80808080L
2566#else
2567# error C 'long' size should be either 4 or 8!
2568#endif
2569
Alexander Belopolsky40018472011-02-26 01:02:56 +00002570PyObject *
2571PyUnicode_DecodeUTF8Stateful(const char *s,
2572 Py_ssize_t size,
2573 const char *errors,
2574 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002575{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002576 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002578 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002579 Py_ssize_t startinpos;
2580 Py_ssize_t endinpos;
2581 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002582 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 PyUnicodeObject *unicode;
2584 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002585 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002586 PyObject *errorHandler = NULL;
2587 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588
2589 /* Note: size will always be longer than the resulting Unicode
2590 character count */
2591 unicode = _PyUnicode_New(size);
2592 if (!unicode)
2593 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002594 if (size == 0) {
2595 if (consumed)
2596 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599
2600 /* Unpack UTF-8 encoded data */
2601 p = unicode->str;
2602 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002603 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604
2605 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002606 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607
2608 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002609 /* Fast path for runs of ASCII characters. Given that common UTF-8
2610 input will consist of an overwhelming majority of ASCII
2611 characters, we try to optimize for this case by checking
2612 as many characters as a C 'long' can contain.
2613 First, check if we can do an aligned read, as most CPUs have
2614 a penalty for unaligned reads.
2615 */
2616 if (!((size_t) s & LONG_PTR_MASK)) {
2617 /* Help register allocation */
2618 register const char *_s = s;
2619 register Py_UNICODE *_p = p;
2620 while (_s < aligned_end) {
2621 /* Read a whole long at a time (either 4 or 8 bytes),
2622 and do a fast unrolled copy if it only contains ASCII
2623 characters. */
2624 unsigned long data = *(unsigned long *) _s;
2625 if (data & ASCII_CHAR_MASK)
2626 break;
2627 _p[0] = (unsigned char) _s[0];
2628 _p[1] = (unsigned char) _s[1];
2629 _p[2] = (unsigned char) _s[2];
2630 _p[3] = (unsigned char) _s[3];
2631#if (SIZEOF_LONG == 8)
2632 _p[4] = (unsigned char) _s[4];
2633 _p[5] = (unsigned char) _s[5];
2634 _p[6] = (unsigned char) _s[6];
2635 _p[7] = (unsigned char) _s[7];
2636#endif
2637 _s += SIZEOF_LONG;
2638 _p += SIZEOF_LONG;
2639 }
2640 s = _s;
2641 p = _p;
2642 if (s == e)
2643 break;
2644 ch = (unsigned char)*s;
2645 }
2646 }
2647
2648 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002649 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002650 s++;
2651 continue;
2652 }
2653
2654 n = utf8_code_length[ch];
2655
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002656 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002657 if (consumed)
2658 break;
2659 else {
2660 errmsg = "unexpected end of data";
2661 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002662 endinpos = startinpos+1;
2663 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2664 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002665 goto utf8Error;
2666 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002668
2669 switch (n) {
2670
2671 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002672 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002673 startinpos = s-starts;
2674 endinpos = startinpos+1;
2675 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676
2677 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002678 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002679 startinpos = s-starts;
2680 endinpos = startinpos+1;
2681 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682
2683 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002684 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002685 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002687 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002688 goto utf8Error;
2689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002691 assert ((ch > 0x007F) && (ch <= 0x07FF));
2692 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002693 break;
2694
2695 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002696 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2697 will result in surrogates in range d800-dfff. Surrogates are
2698 not valid UTF-8 so they are rejected.
2699 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2700 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002701 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002702 (s[2] & 0xc0) != 0x80 ||
2703 ((unsigned char)s[0] == 0xE0 &&
2704 (unsigned char)s[1] < 0xA0) ||
2705 ((unsigned char)s[0] == 0xED &&
2706 (unsigned char)s[1] > 0x9F)) {
2707 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002708 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002709 endinpos = startinpos + 1;
2710
2711 /* if s[1] first two bits are 1 and 0, then the invalid
2712 continuation byte is s[2], so increment endinpos by 1,
2713 if not, s[1] is invalid and endinpos doesn't need to
2714 be incremented. */
2715 if ((s[1] & 0xC0) == 0x80)
2716 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002717 goto utf8Error;
2718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002720 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2721 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002722 break;
2723
2724 case 4:
2725 if ((s[1] & 0xc0) != 0x80 ||
2726 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002727 (s[3] & 0xc0) != 0x80 ||
2728 ((unsigned char)s[0] == 0xF0 &&
2729 (unsigned char)s[1] < 0x90) ||
2730 ((unsigned char)s[0] == 0xF4 &&
2731 (unsigned char)s[1] > 0x8F)) {
2732 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002733 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002734 endinpos = startinpos + 1;
2735 if ((s[1] & 0xC0) == 0x80) {
2736 endinpos++;
2737 if ((s[2] & 0xC0) == 0x80)
2738 endinpos++;
2739 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002740 goto utf8Error;
2741 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002742 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002743 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2744 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2745
Fredrik Lundh8f455852001-06-27 18:59:43 +00002746#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002747 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002748#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002749 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002750
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002751 /* translate from 10000..10FFFF to 0..FFFF */
2752 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002753
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002754 /* high surrogate = top 10 bits added to D800 */
2755 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002756
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002757 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002758 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002759#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761 }
2762 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002764
Benjamin Peterson29060642009-01-31 22:14:21 +00002765 utf8Error:
2766 outpos = p-PyUnicode_AS_UNICODE(unicode);
2767 if (unicode_decode_call_errorhandler(
2768 errors, &errorHandler,
2769 "utf8", errmsg,
2770 &starts, &e, &startinpos, &endinpos, &exc, &s,
2771 &unicode, &outpos, &p))
2772 goto onError;
2773 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002774 }
Walter Dörwald69652032004-09-07 20:24:22 +00002775 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002776 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777
2778 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002779 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 goto onError;
2781
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 Py_XDECREF(errorHandler);
2783 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 return (PyObject *)unicode;
2785
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002787 Py_XDECREF(errorHandler);
2788 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 Py_DECREF(unicode);
2790 return NULL;
2791}
2792
Antoine Pitrouab868312009-01-10 15:40:25 +00002793#undef ASCII_CHAR_MASK
2794
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a freshly PyMem_Malloc()ed,
   NUL-terminated wchar_t string.  Every byte that is not part of a valid
   UTF-8 sequence is emitted as the single code unit 0xDC00 + byte.
   Returns NULL if the buffer cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Room for size + 1 wchar_t is needed; the test is arranged so that
       no `size + 1` is computed in signed arithmetic. */
    if ((size_t)size > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* sequence cut off by the end of the input */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the output buffer is wchar_t, like every other store here */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the offending lead byte here: the multi-byte
           cases only overwrite it after validation has succeeded.
           Escape that one byte and resume at the next one. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002909
Tim Peters602f7402002-04-27 18:03:26 +00002910/* Allocation strategy: if the string is short, convert into a stack buffer
2911 and allocate exactly as much space needed at the end. Else allocate the
2912 maximum possible needed (4 result bytes per Unicode character), and return
2913 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002914*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002915PyObject *
2916PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002917 Py_ssize_t size,
2918 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919{
Tim Peters602f7402002-04-27 18:03:26 +00002920#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002921
Guido van Rossum98297ee2007-11-06 21:34:58 +00002922 Py_ssize_t i; /* index into s of next input byte */
2923 PyObject *result; /* result string object */
2924 char *p; /* next free byte in output buffer */
2925 Py_ssize_t nallocated; /* number of result bytes allocated */
2926 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002927 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002928 PyObject *errorHandler = NULL;
2929 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002930
Tim Peters602f7402002-04-27 18:03:26 +00002931 assert(s != NULL);
2932 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933
Tim Peters602f7402002-04-27 18:03:26 +00002934 if (size <= MAX_SHORT_UNICHARS) {
2935 /* Write into the stack buffer; nallocated can't overflow.
2936 * At the end, we'll allocate exactly as much heap space as it
2937 * turns out we need.
2938 */
2939 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002940 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002941 p = stackbuf;
2942 }
2943 else {
2944 /* Overallocate on the heap, and give the excess back at the end. */
2945 nallocated = size * 4;
2946 if (nallocated / 4 != size) /* overflow! */
2947 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002948 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002949 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002950 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002951 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002952 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002953
Tim Peters602f7402002-04-27 18:03:26 +00002954 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002955 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002956
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002957 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002958 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002959 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002960
Guido van Rossumd57fd912000-03-10 22:53:23 +00002961 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002962 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002963 *p++ = (char)(0xc0 | (ch >> 6));
2964 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002965 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002966#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002967 /* Special case: check for high and low surrogate */
2968 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2969 Py_UCS4 ch2 = s[i];
2970 /* Combine the two surrogates to form a UCS4 value */
2971 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2972 i++;
2973
2974 /* Encode UCS4 Unicode ordinals */
2975 *p++ = (char)(0xf0 | (ch >> 18));
2976 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002977 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2978 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002979 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002980#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002981 Py_ssize_t newpos;
2982 PyObject *rep;
2983 Py_ssize_t repsize, k;
2984 rep = unicode_encode_call_errorhandler
2985 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2986 s, size, &exc, i-1, i, &newpos);
2987 if (!rep)
2988 goto error;
2989
2990 if (PyBytes_Check(rep))
2991 repsize = PyBytes_GET_SIZE(rep);
2992 else
2993 repsize = PyUnicode_GET_SIZE(rep);
2994
2995 if (repsize > 4) {
2996 Py_ssize_t offset;
2997
2998 if (result == NULL)
2999 offset = p - stackbuf;
3000 else
3001 offset = p - PyBytes_AS_STRING(result);
3002
3003 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3004 /* integer overflow */
3005 PyErr_NoMemory();
3006 goto error;
3007 }
3008 nallocated += repsize - 4;
3009 if (result != NULL) {
3010 if (_PyBytes_Resize(&result, nallocated) < 0)
3011 goto error;
3012 } else {
3013 result = PyBytes_FromStringAndSize(NULL, nallocated);
3014 if (result == NULL)
3015 goto error;
3016 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3017 }
3018 p = PyBytes_AS_STRING(result) + offset;
3019 }
3020
3021 if (PyBytes_Check(rep)) {
3022 char *prep = PyBytes_AS_STRING(rep);
3023 for(k = repsize; k > 0; k--)
3024 *p++ = *prep++;
3025 } else /* rep is unicode */ {
3026 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3027 Py_UNICODE c;
3028
3029 for(k=0; k<repsize; k++) {
3030 c = prep[k];
3031 if (0x80 <= c) {
3032 raise_encode_exception(&exc, "utf-8", s, size,
3033 i-1, i, "surrogates not allowed");
3034 goto error;
3035 }
3036 *p++ = (char)prep[k];
3037 }
3038 }
3039 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003040#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003041 }
Victor Stinner445a6232010-04-22 20:01:57 +00003042#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003043 } else if (ch < 0x10000) {
3044 *p++ = (char)(0xe0 | (ch >> 12));
3045 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3046 *p++ = (char)(0x80 | (ch & 0x3f));
3047 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003048 /* Encode UCS4 Unicode ordinals */
3049 *p++ = (char)(0xf0 | (ch >> 18));
3050 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3051 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3052 *p++ = (char)(0x80 | (ch & 0x3f));
3053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003055
Guido van Rossum98297ee2007-11-06 21:34:58 +00003056 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003057 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003058 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003059 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003060 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003061 }
3062 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003063 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003064 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003065 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003066 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003067 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003068 Py_XDECREF(errorHandler);
3069 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003070 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003071 error:
3072 Py_XDECREF(errorHandler);
3073 Py_XDECREF(exc);
3074 Py_XDECREF(result);
3075 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003076
Tim Peters602f7402002-04-27 18:03:26 +00003077#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003078}
3079
Alexander Belopolsky40018472011-02-26 01:02:56 +00003080PyObject *
3081PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083 if (!PyUnicode_Check(unicode)) {
3084 PyErr_BadArgument();
3085 return NULL;
3086 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003087 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 PyUnicode_GET_SIZE(unicode),
3089 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090}
3091
Walter Dörwald41980ca2007-08-16 21:55:45 +00003092/* --- UTF-32 Codec ------------------------------------------------------- */
3093
3094PyObject *
3095PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003096 Py_ssize_t size,
3097 const char *errors,
3098 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003099{
3100 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3101}
3102
3103PyObject *
3104PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 Py_ssize_t size,
3106 const char *errors,
3107 int *byteorder,
3108 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003109{
3110 const char *starts = s;
3111 Py_ssize_t startinpos;
3112 Py_ssize_t endinpos;
3113 Py_ssize_t outpos;
3114 PyUnicodeObject *unicode;
3115 Py_UNICODE *p;
3116#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003117 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003118 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003119#else
3120 const int pairs = 0;
3121#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003122 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003123 int bo = 0; /* assume native ordering by default */
3124 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003125 /* Offsets from q for retrieving bytes in the right order. */
3126#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3127 int iorder[] = {0, 1, 2, 3};
3128#else
3129 int iorder[] = {3, 2, 1, 0};
3130#endif
3131 PyObject *errorHandler = NULL;
3132 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003133
Walter Dörwald41980ca2007-08-16 21:55:45 +00003134 q = (unsigned char *)s;
3135 e = q + size;
3136
3137 if (byteorder)
3138 bo = *byteorder;
3139
3140 /* Check for BOM marks (U+FEFF) in the input and adjust current
3141 byte order setting accordingly. In native mode, the leading BOM
3142 mark is skipped, in all other modes, it is copied to the output
3143 stream as-is (giving a ZWNBSP character). */
3144 if (bo == 0) {
3145 if (size >= 4) {
3146 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003147 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003148#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003149 if (bom == 0x0000FEFF) {
3150 q += 4;
3151 bo = -1;
3152 }
3153 else if (bom == 0xFFFE0000) {
3154 q += 4;
3155 bo = 1;
3156 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003157#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003158 if (bom == 0x0000FEFF) {
3159 q += 4;
3160 bo = 1;
3161 }
3162 else if (bom == 0xFFFE0000) {
3163 q += 4;
3164 bo = -1;
3165 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003166#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003168 }
3169
3170 if (bo == -1) {
3171 /* force LE */
3172 iorder[0] = 0;
3173 iorder[1] = 1;
3174 iorder[2] = 2;
3175 iorder[3] = 3;
3176 }
3177 else if (bo == 1) {
3178 /* force BE */
3179 iorder[0] = 3;
3180 iorder[1] = 2;
3181 iorder[2] = 1;
3182 iorder[3] = 0;
3183 }
3184
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003185 /* On narrow builds we split characters outside the BMP into two
3186 codepoints => count how much extra space we need. */
3187#ifndef Py_UNICODE_WIDE
3188 for (qq = q; qq < e; qq += 4)
3189 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3190 pairs++;
3191#endif
3192
3193 /* This might be one to much, because of a BOM */
3194 unicode = _PyUnicode_New((size+3)/4+pairs);
3195 if (!unicode)
3196 return NULL;
3197 if (size == 0)
3198 return (PyObject *)unicode;
3199
3200 /* Unpack UTF-32 encoded data */
3201 p = unicode->str;
3202
Walter Dörwald41980ca2007-08-16 21:55:45 +00003203 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003204 Py_UCS4 ch;
3205 /* remaining bytes at the end? (size should be divisible by 4) */
3206 if (e-q<4) {
3207 if (consumed)
3208 break;
3209 errmsg = "truncated data";
3210 startinpos = ((const char *)q)-starts;
3211 endinpos = ((const char *)e)-starts;
3212 goto utf32Error;
3213 /* The remaining input chars are ignored if the callback
3214 chooses to skip the input */
3215 }
3216 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3217 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003218
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 if (ch >= 0x110000)
3220 {
3221 errmsg = "codepoint not in range(0x110000)";
3222 startinpos = ((const char *)q)-starts;
3223 endinpos = startinpos+4;
3224 goto utf32Error;
3225 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003226#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 if (ch >= 0x10000)
3228 {
3229 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3230 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3231 }
3232 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003233#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 *p++ = ch;
3235 q += 4;
3236 continue;
3237 utf32Error:
3238 outpos = p-PyUnicode_AS_UNICODE(unicode);
3239 if (unicode_decode_call_errorhandler(
3240 errors, &errorHandler,
3241 "utf32", errmsg,
3242 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3243 &unicode, &outpos, &p))
3244 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003245 }
3246
3247 if (byteorder)
3248 *byteorder = bo;
3249
3250 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003251 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003252
3253 /* Adjust length */
3254 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3255 goto onError;
3256
3257 Py_XDECREF(errorHandler);
3258 Py_XDECREF(exc);
3259 return (PyObject *)unicode;
3260
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003262 Py_DECREF(unicode);
3263 Py_XDECREF(errorHandler);
3264 Py_XDECREF(exc);
3265 return NULL;
3266}
3267
3268PyObject *
3269PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 Py_ssize_t size,
3271 const char *errors,
3272 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003273{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003274 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003275 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003276 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003277#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003278 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003279#else
3280 const int pairs = 0;
3281#endif
3282 /* Offsets from p for storing byte pairs in the right order. */
3283#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3284 int iorder[] = {0, 1, 2, 3};
3285#else
3286 int iorder[] = {3, 2, 1, 0};
3287#endif
3288
Benjamin Peterson29060642009-01-31 22:14:21 +00003289#define STORECHAR(CH) \
3290 do { \
3291 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3292 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3293 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3294 p[iorder[0]] = (CH) & 0xff; \
3295 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003296 } while(0)
3297
3298 /* In narrow builds we can output surrogate pairs as one codepoint,
3299 so we need less space. */
3300#ifndef Py_UNICODE_WIDE
3301 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003302 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3303 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3304 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003305#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003306 nsize = (size - pairs + (byteorder == 0));
3307 bytesize = nsize * 4;
3308 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003309 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003310 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003311 if (v == NULL)
3312 return NULL;
3313
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003314 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003315 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003316 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003317 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003318 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003319
3320 if (byteorder == -1) {
3321 /* force LE */
3322 iorder[0] = 0;
3323 iorder[1] = 1;
3324 iorder[2] = 2;
3325 iorder[3] = 3;
3326 }
3327 else if (byteorder == 1) {
3328 /* force BE */
3329 iorder[0] = 3;
3330 iorder[1] = 2;
3331 iorder[2] = 1;
3332 iorder[3] = 0;
3333 }
3334
3335 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003336 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003337#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003338 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3339 Py_UCS4 ch2 = *s;
3340 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3341 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3342 s++;
3343 size--;
3344 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003345 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003346#endif
3347 STORECHAR(ch);
3348 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003349
3350 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003351 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003352#undef STORECHAR
3353}
3354
Alexander Belopolsky40018472011-02-26 01:02:56 +00003355PyObject *
3356PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003357{
3358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_BadArgument();
3360 return NULL;
3361 }
3362 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003363 PyUnicode_GET_SIZE(unicode),
3364 NULL,
3365 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003366}
3367
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368/* --- UTF-16 Codec ------------------------------------------------------- */
3369
Tim Peters772747b2001-08-09 22:21:55 +00003370PyObject *
3371PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 Py_ssize_t size,
3373 const char *errors,
3374 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375{
Walter Dörwald69652032004-09-07 20:24:22 +00003376 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3377}
3378
Antoine Pitrouab868312009-01-10 15:40:25 +00003379/* Two masks for fast checking of whether a C 'long' may contain
3380 UTF16-encoded surrogate characters. This is an efficient heuristic,
3381 assuming that non-surrogate characters with a code point >= 0x8000 are
3382 rare in most input.
3383 FAST_CHAR_MASK is used when the input is in native byte ordering,
3384 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003385*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003386#if (SIZEOF_LONG == 8)
3387# define FAST_CHAR_MASK 0x8000800080008000L
3388# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3389#elif (SIZEOF_LONG == 4)
3390# define FAST_CHAR_MASK 0x80008000L
3391# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3392#else
3393# error C 'long' size should be either 4 or 8!
3394#endif
3395
Walter Dörwald69652032004-09-07 20:24:22 +00003396PyObject *
3397PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003398 Py_ssize_t size,
3399 const char *errors,
3400 int *byteorder,
3401 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003402{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003404 Py_ssize_t startinpos;
3405 Py_ssize_t endinpos;
3406 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003407 PyUnicodeObject *unicode;
3408 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003409 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003410 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003411 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003412 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003413 /* Offsets from q for retrieving byte pairs in the right order. */
3414#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3415 int ihi = 1, ilo = 0;
3416#else
3417 int ihi = 0, ilo = 1;
3418#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 PyObject *errorHandler = NULL;
3420 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421
3422 /* Note: size will always be longer than the resulting Unicode
3423 character count */
3424 unicode = _PyUnicode_New(size);
3425 if (!unicode)
3426 return NULL;
3427 if (size == 0)
3428 return (PyObject *)unicode;
3429
3430 /* Unpack UTF-16 encoded data */
3431 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003432 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003433 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434
3435 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003436 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003438 /* Check for BOM marks (U+FEFF) in the input and adjust current
3439 byte order setting accordingly. In native mode, the leading BOM
3440 mark is skipped, in all other modes, it is copied to the output
3441 stream as-is (giving a ZWNBSP character). */
3442 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003443 if (size >= 2) {
3444 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003445#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 if (bom == 0xFEFF) {
3447 q += 2;
3448 bo = -1;
3449 }
3450 else if (bom == 0xFFFE) {
3451 q += 2;
3452 bo = 1;
3453 }
Tim Petersced69f82003-09-16 20:30:58 +00003454#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 if (bom == 0xFEFF) {
3456 q += 2;
3457 bo = 1;
3458 }
3459 else if (bom == 0xFFFE) {
3460 q += 2;
3461 bo = -1;
3462 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003463#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466
Tim Peters772747b2001-08-09 22:21:55 +00003467 if (bo == -1) {
3468 /* force LE */
3469 ihi = 1;
3470 ilo = 0;
3471 }
3472 else if (bo == 1) {
3473 /* force BE */
3474 ihi = 0;
3475 ilo = 1;
3476 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003477#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3478 native_ordering = ilo < ihi;
3479#else
3480 native_ordering = ilo > ihi;
3481#endif
Tim Peters772747b2001-08-09 22:21:55 +00003482
Antoine Pitrouab868312009-01-10 15:40:25 +00003483 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003484 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003485 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003486 /* First check for possible aligned read of a C 'long'. Unaligned
3487 reads are more expensive, better to defer to another iteration. */
3488 if (!((size_t) q & LONG_PTR_MASK)) {
3489 /* Fast path for runs of non-surrogate chars. */
3490 register const unsigned char *_q = q;
3491 Py_UNICODE *_p = p;
3492 if (native_ordering) {
3493 /* Native ordering is simple: as long as the input cannot
3494 possibly contain a surrogate char, do an unrolled copy
3495 of several 16-bit code points to the target object.
3496 The non-surrogate check is done on several input bytes
3497 at a time (as many as a C 'long' can contain). */
3498 while (_q < aligned_end) {
3499 unsigned long data = * (unsigned long *) _q;
3500 if (data & FAST_CHAR_MASK)
3501 break;
3502 _p[0] = ((unsigned short *) _q)[0];
3503 _p[1] = ((unsigned short *) _q)[1];
3504#if (SIZEOF_LONG == 8)
3505 _p[2] = ((unsigned short *) _q)[2];
3506 _p[3] = ((unsigned short *) _q)[3];
3507#endif
3508 _q += SIZEOF_LONG;
3509 _p += SIZEOF_LONG / 2;
3510 }
3511 }
3512 else {
3513 /* Byteswapped ordering is similar, but we must decompose
3514 the copy bytewise, and take care of zero'ing out the
3515 upper bytes if the target object is in 32-bit units
3516 (that is, in UCS-4 builds). */
3517 while (_q < aligned_end) {
3518 unsigned long data = * (unsigned long *) _q;
3519 if (data & SWAPPED_FAST_CHAR_MASK)
3520 break;
3521 /* Zero upper bytes in UCS-4 builds */
3522#if (Py_UNICODE_SIZE > 2)
3523 _p[0] = 0;
3524 _p[1] = 0;
3525#if (SIZEOF_LONG == 8)
3526 _p[2] = 0;
3527 _p[3] = 0;
3528#endif
3529#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003530 /* Issue #4916; UCS-4 builds on big endian machines must
3531 fill the two last bytes of each 4-byte unit. */
3532#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3533# define OFF 2
3534#else
3535# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003536#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003537 ((unsigned char *) _p)[OFF + 1] = _q[0];
3538 ((unsigned char *) _p)[OFF + 0] = _q[1];
3539 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3540 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3541#if (SIZEOF_LONG == 8)
3542 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3543 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3544 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3545 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3546#endif
3547#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003548 _q += SIZEOF_LONG;
3549 _p += SIZEOF_LONG / 2;
3550 }
3551 }
3552 p = _p;
3553 q = _q;
3554 if (q >= e)
3555 break;
3556 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003557 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003558
Benjamin Peterson14339b62009-01-31 16:36:08 +00003559 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003560
3561 if (ch < 0xD800 || ch > 0xDFFF) {
3562 *p++ = ch;
3563 continue;
3564 }
3565
3566 /* UTF-16 code pair: */
3567 if (q > e) {
3568 errmsg = "unexpected end of data";
3569 startinpos = (((const char *)q) - 2) - starts;
3570 endinpos = ((const char *)e) + 1 - starts;
3571 goto utf16Error;
3572 }
3573 if (0xD800 <= ch && ch <= 0xDBFF) {
3574 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3575 q += 2;
3576 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003577#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003578 *p++ = ch;
3579 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003580#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003581 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003582#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003583 continue;
3584 }
3585 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003586 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 startinpos = (((const char *)q)-4)-starts;
3588 endinpos = startinpos+2;
3589 goto utf16Error;
3590 }
3591
Benjamin Peterson14339b62009-01-31 16:36:08 +00003592 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003593 errmsg = "illegal encoding";
3594 startinpos = (((const char *)q)-2)-starts;
3595 endinpos = startinpos+2;
3596 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003597
Benjamin Peterson29060642009-01-31 22:14:21 +00003598 utf16Error:
3599 outpos = p - PyUnicode_AS_UNICODE(unicode);
3600 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003601 errors,
3602 &errorHandler,
3603 "utf16", errmsg,
3604 &starts,
3605 (const char **)&e,
3606 &startinpos,
3607 &endinpos,
3608 &exc,
3609 (const char **)&q,
3610 &unicode,
3611 &outpos,
3612 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003613 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003615 /* remaining byte at the end? (size should be even) */
3616 if (e == q) {
3617 if (!consumed) {
3618 errmsg = "truncated data";
3619 startinpos = ((const char *)q) - starts;
3620 endinpos = ((const char *)e) + 1 - starts;
3621 outpos = p - PyUnicode_AS_UNICODE(unicode);
3622 if (unicode_decode_call_errorhandler(
3623 errors,
3624 &errorHandler,
3625 "utf16", errmsg,
3626 &starts,
3627 (const char **)&e,
3628 &startinpos,
3629 &endinpos,
3630 &exc,
3631 (const char **)&q,
3632 &unicode,
3633 &outpos,
3634 &p))
3635 goto onError;
3636 /* The remaining input chars are ignored if the callback
3637 chooses to skip the input */
3638 }
3639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640
3641 if (byteorder)
3642 *byteorder = bo;
3643
Walter Dörwald69652032004-09-07 20:24:22 +00003644 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003646
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003648 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 goto onError;
3650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003651 Py_XDECREF(errorHandler);
3652 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 return (PyObject *)unicode;
3654
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 Py_XDECREF(errorHandler);
3658 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003659 return NULL;
3660}
3661
Antoine Pitrouab868312009-01-10 15:40:25 +00003662#undef FAST_CHAR_MASK
3663#undef SWAPPED_FAST_CHAR_MASK
3664
Tim Peters772747b2001-08-09 22:21:55 +00003665PyObject *
3666PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 Py_ssize_t size,
3668 const char *errors,
3669 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003671 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003672 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003673 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003674#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003675 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003676#else
3677 const int pairs = 0;
3678#endif
Tim Peters772747b2001-08-09 22:21:55 +00003679 /* Offsets from p for storing byte pairs in the right order. */
3680#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3681 int ihi = 1, ilo = 0;
3682#else
3683 int ihi = 0, ilo = 1;
3684#endif
3685
Benjamin Peterson29060642009-01-31 22:14:21 +00003686#define STORECHAR(CH) \
3687 do { \
3688 p[ihi] = ((CH) >> 8) & 0xff; \
3689 p[ilo] = (CH) & 0xff; \
3690 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003691 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003693#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003694 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 if (s[i] >= 0x10000)
3696 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003697#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003698 /* 2 * (size + pairs + (byteorder == 0)) */
3699 if (size > PY_SSIZE_T_MAX ||
3700 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003701 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003702 nsize = size + pairs + (byteorder == 0);
3703 bytesize = nsize * 2;
3704 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003706 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 if (v == NULL)
3708 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003709
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003710 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003713 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003714 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003715
3716 if (byteorder == -1) {
3717 /* force LE */
3718 ihi = 1;
3719 ilo = 0;
3720 }
3721 else if (byteorder == 1) {
3722 /* force BE */
3723 ihi = 0;
3724 ilo = 1;
3725 }
3726
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003727 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 Py_UNICODE ch = *s++;
3729 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003730#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003731 if (ch >= 0x10000) {
3732 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3733 ch = 0xD800 | ((ch-0x10000) >> 10);
3734 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003735#endif
Tim Peters772747b2001-08-09 22:21:55 +00003736 STORECHAR(ch);
3737 if (ch2)
3738 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003739 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003740
3741 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003742 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003743#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744}
3745
Alexander Belopolsky40018472011-02-26 01:02:56 +00003746PyObject *
3747PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748{
3749 if (!PyUnicode_Check(unicode)) {
3750 PyErr_BadArgument();
3751 return NULL;
3752 }
3753 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003754 PyUnicode_GET_SIZE(unicode),
3755 NULL,
3756 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757}
3758
3759/* --- Unicode Escape Codec ----------------------------------------------- */
3760
/* File-scope pointer to a _PyUnicode_Name_CAPI, initialised to NULL.
   NOTE(review): presumably set on first use by the named-character
   (\N{...}) escape handling further down in this file -- confirm against
   the code that assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00003761static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003762
Alexander Belopolsky40018472011-02-26 01:02:56 +00003763PyObject *
3764PyUnicode_DecodeUnicodeEscape(const char *s,
3765 Py_ssize_t size,
3766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003767{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003769 Py_ssize_t startinpos;
3770 Py_ssize_t endinpos;
3771 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003775 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003776 char* message;
3777 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 PyObject *errorHandler = NULL;
3779 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003780
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781 /* Escaped strings will always be longer than the resulting
3782 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 length after conversion to the true value.
3784 (but if the error callback returns a long replacement string
3785 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786 v = _PyUnicode_New(size);
3787 if (v == NULL)
3788 goto onError;
3789 if (size == 0)
3790 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003791
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003794
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 while (s < end) {
3796 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003797 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799
3800 /* Non-escape characters are interpreted as Unicode ordinals */
3801 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003802 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803 continue;
3804 }
3805
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807 /* \ - Escapes */
3808 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003809 c = *s++;
3810 if (s > end)
3811 c = '\0'; /* Invalid after \ */
3812 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813
Benjamin Peterson29060642009-01-31 22:14:21 +00003814 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 case '\n': break;
3816 case '\\': *p++ = '\\'; break;
3817 case '\'': *p++ = '\''; break;
3818 case '\"': *p++ = '\"'; break;
3819 case 'b': *p++ = '\b'; break;
3820 case 'f': *p++ = '\014'; break; /* FF */
3821 case 't': *p++ = '\t'; break;
3822 case 'n': *p++ = '\n'; break;
3823 case 'r': *p++ = '\r'; break;
3824 case 'v': *p++ = '\013'; break; /* VT */
3825 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3826
Benjamin Peterson29060642009-01-31 22:14:21 +00003827 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 case '0': case '1': case '2': case '3':
3829 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003830 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003831 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003832 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003833 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003834 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003836 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 break;
3838
Benjamin Peterson29060642009-01-31 22:14:21 +00003839 /* hex escapes */
3840 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003842 digits = 2;
3843 message = "truncated \\xXX escape";
3844 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845
Benjamin Peterson29060642009-01-31 22:14:21 +00003846 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003848 digits = 4;
3849 message = "truncated \\uXXXX escape";
3850 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851
Benjamin Peterson29060642009-01-31 22:14:21 +00003852 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003853 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003854 digits = 8;
3855 message = "truncated \\UXXXXXXXX escape";
3856 hexescape:
3857 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003858 outpos = p-PyUnicode_AS_UNICODE(v);
3859 if (s+digits>end) {
3860 endinpos = size;
3861 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003862 errors, &errorHandler,
3863 "unicodeescape", "end of string in escape sequence",
3864 &starts, &end, &startinpos, &endinpos, &exc, &s,
3865 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003866 goto onError;
3867 goto nextByte;
3868 }
3869 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003870 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003871 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003872 endinpos = (s+i+1)-starts;
3873 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003874 errors, &errorHandler,
3875 "unicodeescape", message,
3876 &starts, &end, &startinpos, &endinpos, &exc, &s,
3877 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003878 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003879 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003880 }
3881 chr = (chr<<4) & ~0xF;
3882 if (c >= '0' && c <= '9')
3883 chr += c - '0';
3884 else if (c >= 'a' && c <= 'f')
3885 chr += 10 + c - 'a';
3886 else
3887 chr += 10 + c - 'A';
3888 }
3889 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003890 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003891 /* _decoding_error will have already written into the
3892 target buffer. */
3893 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003894 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003895 /* when we get here, chr is a 32-bit unicode character */
3896 if (chr <= 0xffff)
3897 /* UCS-2 character */
3898 *p++ = (Py_UNICODE) chr;
3899 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003900 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003901 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003902#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003903 *p++ = chr;
3904#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003905 chr -= 0x10000L;
3906 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003907 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003908#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003909 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003910 endinpos = s-starts;
3911 outpos = p-PyUnicode_AS_UNICODE(v);
3912 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003913 errors, &errorHandler,
3914 "unicodeescape", "illegal Unicode character",
3915 &starts, &end, &startinpos, &endinpos, &exc, &s,
3916 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003917 goto onError;
3918 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003919 break;
3920
Benjamin Peterson29060642009-01-31 22:14:21 +00003921 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003922 case 'N':
3923 message = "malformed \\N character escape";
3924 if (ucnhash_CAPI == NULL) {
3925 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003926 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003927 if (ucnhash_CAPI == NULL)
3928 goto ucnhashError;
3929 }
3930 if (*s == '{') {
3931 const char *start = s+1;
3932 /* look for the closing brace */
3933 while (*s != '}' && s < end)
3934 s++;
3935 if (s > start && s < end && *s == '}') {
3936 /* found a name. look it up in the unicode database */
3937 message = "unknown Unicode character name";
3938 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003939 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003940 goto store;
3941 }
3942 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943 endinpos = s-starts;
3944 outpos = p-PyUnicode_AS_UNICODE(v);
3945 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003946 errors, &errorHandler,
3947 "unicodeescape", message,
3948 &starts, &end, &startinpos, &endinpos, &exc, &s,
3949 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003950 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003951 break;
3952
3953 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003954 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003955 message = "\\ at end of string";
3956 s--;
3957 endinpos = s-starts;
3958 outpos = p-PyUnicode_AS_UNICODE(v);
3959 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003960 errors, &errorHandler,
3961 "unicodeescape", message,
3962 &starts, &end, &startinpos, &endinpos, &exc, &s,
3963 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003964 goto onError;
3965 }
3966 else {
3967 *p++ = '\\';
3968 *p++ = (unsigned char)s[-1];
3969 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003970 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003971 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003972 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003975 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003977 Py_XDECREF(errorHandler);
3978 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003979 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003980
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003982 PyErr_SetString(
3983 PyExc_UnicodeError,
3984 "\\N escapes not supported (can't load unicodedata module)"
3985 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003986 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 Py_XDECREF(errorHandler);
3988 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003989 return NULL;
3990
Benjamin Peterson29060642009-01-31 22:14:21 +00003991 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003993 Py_XDECREF(errorHandler);
3994 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 return NULL;
3996}
3997
3998/* Return a Unicode-Escape string version of the Unicode object.
3999
4000 If quotes is true, the string is enclosed in u"" or u'' quotes as
4001 appropriate.
4002
4003*/
4004
Thomas Wouters477c8d52006-05-27 19:21:47 +00004005Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 Py_ssize_t size,
4007 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00004008{
4009 /* like wcschr, but doesn't stop at NULL characters */
4010
4011 while (size-- > 0) {
4012 if (*s == ch)
4013 return s;
4014 s++;
4015 }
4016
4017 return NULL;
4018}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004019
Walter Dörwald79e913e2007-05-12 11:08:06 +00004020static const char *hexdigits = "0123456789abcdef";
4021
Alexander Belopolsky40018472011-02-26 01:02:56 +00004022PyObject *
4023PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4024 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004026 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004029#ifdef Py_UNICODE_WIDE
4030 const Py_ssize_t expandsize = 10;
4031#else
4032 const Py_ssize_t expandsize = 6;
4033#endif
4034
Thomas Wouters89f507f2006-12-13 04:49:30 +00004035 /* XXX(nnorwitz): rather than over-allocating, it would be
4036 better to choose a different scheme. Perhaps scan the
4037 first N-chars of the string and allocate based on that size.
4038 */
4039 /* Initial allocation is based on the longest-possible unichr
4040 escape.
4041
4042 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4043 unichr, so in this case it's the longest unichr escape. In
4044 narrow (UTF-16) builds this is five chars per source unichr
4045 since there are two unichrs in the surrogate pair, so in narrow
4046 (UTF-16) builds it's not the longest unichr escape.
4047
4048 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4049 so in the narrow (UTF-16) build case it's the longest unichr
4050 escape.
4051 */
4052
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004053 if (size == 0)
4054 return PyBytes_FromStringAndSize(NULL, 0);
4055
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004056 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004057 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004058
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004059 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004060 2
4061 + expandsize*size
4062 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004063 if (repr == NULL)
4064 return NULL;
4065
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004066 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 while (size-- > 0) {
4069 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004070
Walter Dörwald79e913e2007-05-12 11:08:06 +00004071 /* Escape backslashes */
4072 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 *p++ = '\\';
4074 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004075 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004076 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004077
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004078#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004079 /* Map 21-bit characters to '\U00xxxxxx' */
4080 else if (ch >= 0x10000) {
4081 *p++ = '\\';
4082 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004083 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4084 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4085 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4086 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4087 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4088 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4089 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4090 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004091 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004092 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004093#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004094 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4095 else if (ch >= 0xD800 && ch < 0xDC00) {
4096 Py_UNICODE ch2;
4097 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004098
Benjamin Peterson29060642009-01-31 22:14:21 +00004099 ch2 = *s++;
4100 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004101 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004102 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4103 *p++ = '\\';
4104 *p++ = 'U';
4105 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4106 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4107 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4108 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4109 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4110 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4111 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4112 *p++ = hexdigits[ucs & 0x0000000F];
4113 continue;
4114 }
4115 /* Fall through: isolated surrogates are copied as-is */
4116 s--;
4117 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004118 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004119#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004120
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004122 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 *p++ = '\\';
4124 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004125 *p++ = hexdigits[(ch >> 12) & 0x000F];
4126 *p++ = hexdigits[(ch >> 8) & 0x000F];
4127 *p++ = hexdigits[(ch >> 4) & 0x000F];
4128 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004130
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004131 /* Map special whitespace to '\t', \n', '\r' */
4132 else if (ch == '\t') {
4133 *p++ = '\\';
4134 *p++ = 't';
4135 }
4136 else if (ch == '\n') {
4137 *p++ = '\\';
4138 *p++ = 'n';
4139 }
4140 else if (ch == '\r') {
4141 *p++ = '\\';
4142 *p++ = 'r';
4143 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004144
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004145 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004146 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004148 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004149 *p++ = hexdigits[(ch >> 4) & 0x000F];
4150 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004151 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004152
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 /* Copy everything else as-is */
4154 else
4155 *p++ = (char) ch;
4156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004158 assert(p - PyBytes_AS_STRING(repr) > 0);
4159 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4160 return NULL;
4161 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162}
4163
Alexander Belopolsky40018472011-02-26 01:02:56 +00004164PyObject *
4165PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004167 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168 if (!PyUnicode_Check(unicode)) {
4169 PyErr_BadArgument();
4170 return NULL;
4171 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004172 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4173 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004174 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175}
4176
4177/* --- Raw Unicode Escape Codec ------------------------------------------- */
4178
Alexander Belopolsky40018472011-02-26 01:02:56 +00004179PyObject *
4180PyUnicode_DecodeRawUnicodeEscape(const char *s,
4181 Py_ssize_t size,
4182 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004184 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004185 Py_ssize_t startinpos;
4186 Py_ssize_t endinpos;
4187 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190 const char *end;
4191 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 PyObject *errorHandler = NULL;
4193 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004194
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 /* Escaped strings will always be longer than the resulting
4196 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 length after conversion to the true value. (But decoding error
4198 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 v = _PyUnicode_New(size);
4200 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004202 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 end = s + size;
4206 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 unsigned char c;
4208 Py_UCS4 x;
4209 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004210 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 /* Non-escape characters are interpreted as Unicode ordinals */
4213 if (*s != '\\') {
4214 *p++ = (unsigned char)*s++;
4215 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004216 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004217 startinpos = s-starts;
4218
4219 /* \u-escapes are only interpreted iff the number of leading
4220 backslashes if odd */
4221 bs = s;
4222 for (;s < end;) {
4223 if (*s != '\\')
4224 break;
4225 *p++ = (unsigned char)*s++;
4226 }
4227 if (((s - bs) & 1) == 0 ||
4228 s >= end ||
4229 (*s != 'u' && *s != 'U')) {
4230 continue;
4231 }
4232 p--;
4233 count = *s=='u' ? 4 : 8;
4234 s++;
4235
4236 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4237 outpos = p-PyUnicode_AS_UNICODE(v);
4238 for (x = 0, i = 0; i < count; ++i, ++s) {
4239 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004240 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004241 endinpos = s-starts;
4242 if (unicode_decode_call_errorhandler(
4243 errors, &errorHandler,
4244 "rawunicodeescape", "truncated \\uXXXX",
4245 &starts, &end, &startinpos, &endinpos, &exc, &s,
4246 &v, &outpos, &p))
4247 goto onError;
4248 goto nextByte;
4249 }
4250 x = (x<<4) & ~0xF;
4251 if (c >= '0' && c <= '9')
4252 x += c - '0';
4253 else if (c >= 'a' && c <= 'f')
4254 x += 10 + c - 'a';
4255 else
4256 x += 10 + c - 'A';
4257 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004258 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004259 /* UCS-2 character */
4260 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004261 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004262 /* UCS-4 character. Either store directly, or as
4263 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004264#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004266#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004267 x -= 0x10000L;
4268 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4269 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004270#endif
4271 } else {
4272 endinpos = s-starts;
4273 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004274 if (unicode_decode_call_errorhandler(
4275 errors, &errorHandler,
4276 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004277 &starts, &end, &startinpos, &endinpos, &exc, &s,
4278 &v, &outpos, &p))
4279 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004280 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 nextByte:
4282 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004284 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004285 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 Py_XDECREF(errorHandler);
4287 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004289
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004292 Py_XDECREF(errorHandler);
4293 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294 return NULL;
4295}
4296
Alexander Belopolsky40018472011-02-26 01:02:56 +00004297PyObject *
4298PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4299 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004301 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302 char *p;
4303 char *q;
4304
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004305#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004306 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004307#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004308 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004309#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004310
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004311 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004312 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004313
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004314 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 if (repr == NULL)
4316 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004317 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004318 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004320 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 while (size-- > 0) {
4322 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004323#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004324 /* Map 32-bit characters to '\Uxxxxxxxx' */
4325 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004326 *p++ = '\\';
4327 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004328 *p++ = hexdigits[(ch >> 28) & 0xf];
4329 *p++ = hexdigits[(ch >> 24) & 0xf];
4330 *p++ = hexdigits[(ch >> 20) & 0xf];
4331 *p++ = hexdigits[(ch >> 16) & 0xf];
4332 *p++ = hexdigits[(ch >> 12) & 0xf];
4333 *p++ = hexdigits[(ch >> 8) & 0xf];
4334 *p++ = hexdigits[(ch >> 4) & 0xf];
4335 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004336 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004337 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004338#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4340 if (ch >= 0xD800 && ch < 0xDC00) {
4341 Py_UNICODE ch2;
4342 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004343
Benjamin Peterson29060642009-01-31 22:14:21 +00004344 ch2 = *s++;
4345 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004346 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004347 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4348 *p++ = '\\';
4349 *p++ = 'U';
4350 *p++ = hexdigits[(ucs >> 28) & 0xf];
4351 *p++ = hexdigits[(ucs >> 24) & 0xf];
4352 *p++ = hexdigits[(ucs >> 20) & 0xf];
4353 *p++ = hexdigits[(ucs >> 16) & 0xf];
4354 *p++ = hexdigits[(ucs >> 12) & 0xf];
4355 *p++ = hexdigits[(ucs >> 8) & 0xf];
4356 *p++ = hexdigits[(ucs >> 4) & 0xf];
4357 *p++ = hexdigits[ucs & 0xf];
4358 continue;
4359 }
4360 /* Fall through: isolated surrogates are copied as-is */
4361 s--;
4362 size++;
4363 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004364#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004365 /* Map 16-bit characters to '\uxxxx' */
4366 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004367 *p++ = '\\';
4368 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004369 *p++ = hexdigits[(ch >> 12) & 0xf];
4370 *p++ = hexdigits[(ch >> 8) & 0xf];
4371 *p++ = hexdigits[(ch >> 4) & 0xf];
4372 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004374 /* Copy everything else as-is */
4375 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 *p++ = (char) ch;
4377 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004378 size = p - q;
4379
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004380 assert(size > 0);
4381 if (_PyBytes_Resize(&repr, size) < 0)
4382 return NULL;
4383 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384}
4385
Alexander Belopolsky40018472011-02-26 01:02:56 +00004386PyObject *
4387PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004389 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004391 PyErr_BadArgument();
4392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004394 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4395 PyUnicode_GET_SIZE(unicode));
4396
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004397 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398}
4399
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004400/* --- Unicode Internal Codec ------------------------------------------- */
4401
Alexander Belopolsky40018472011-02-26 01:02:56 +00004402PyObject *
4403_PyUnicode_DecodeUnicodeInternal(const char *s,
4404 Py_ssize_t size,
4405 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004406{
4407 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004408 Py_ssize_t startinpos;
4409 Py_ssize_t endinpos;
4410 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004411 PyUnicodeObject *v;
4412 Py_UNICODE *p;
4413 const char *end;
4414 const char *reason;
4415 PyObject *errorHandler = NULL;
4416 PyObject *exc = NULL;
4417
Neal Norwitzd43069c2006-01-08 01:12:10 +00004418#ifdef Py_UNICODE_WIDE
4419 Py_UNICODE unimax = PyUnicode_GetMax();
4420#endif
4421
Thomas Wouters89f507f2006-12-13 04:49:30 +00004422 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004423 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4424 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004425 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004426 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004428 p = PyUnicode_AS_UNICODE(v);
4429 end = s + size;
4430
4431 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004432 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004433 /* We have to sanity check the raw data, otherwise doom looms for
4434 some malformed UCS-4 data. */
4435 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004436#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004437 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004438#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004439 end-s < Py_UNICODE_SIZE
4440 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004441 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004442 startinpos = s - starts;
4443 if (end-s < Py_UNICODE_SIZE) {
4444 endinpos = end-starts;
4445 reason = "truncated input";
4446 }
4447 else {
4448 endinpos = s - starts + Py_UNICODE_SIZE;
4449 reason = "illegal code point (> 0x10FFFF)";
4450 }
4451 outpos = p - PyUnicode_AS_UNICODE(v);
4452 if (unicode_decode_call_errorhandler(
4453 errors, &errorHandler,
4454 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004455 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004456 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004457 goto onError;
4458 }
4459 }
4460 else {
4461 p++;
4462 s += Py_UNICODE_SIZE;
4463 }
4464 }
4465
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004466 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004467 goto onError;
4468 Py_XDECREF(errorHandler);
4469 Py_XDECREF(exc);
4470 return (PyObject *)v;
4471
Benjamin Peterson29060642009-01-31 22:14:21 +00004472 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004473 Py_XDECREF(v);
4474 Py_XDECREF(errorHandler);
4475 Py_XDECREF(exc);
4476 return NULL;
4477}
4478
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479/* --- Latin-1 Codec ------------------------------------------------------ */
4480
Alexander Belopolsky40018472011-02-26 01:02:56 +00004481PyObject *
4482PyUnicode_DecodeLatin1(const char *s,
4483 Py_ssize_t size,
4484 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004485{
4486 PyUnicodeObject *v;
4487 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004488 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004489
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004491 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 Py_UNICODE r = *(unsigned char*)s;
4493 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004494 }
4495
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496 v = _PyUnicode_New(size);
4497 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004498 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004502 e = s + size;
4503 /* Unrolling the copy makes it much faster by reducing the looping
4504 overhead. This is similar to what many memcpy() implementations do. */
4505 unrolled_end = e - 4;
4506 while (s < unrolled_end) {
4507 p[0] = (unsigned char) s[0];
4508 p[1] = (unsigned char) s[1];
4509 p[2] = (unsigned char) s[2];
4510 p[3] = (unsigned char) s[3];
4511 s += 4;
4512 p += 4;
4513 }
4514 while (s < e)
4515 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004517
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 Py_XDECREF(v);
4520 return NULL;
4521}
4522
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004523/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004524static void
4525make_encode_exception(PyObject **exceptionObject,
4526 const char *encoding,
4527 const Py_UNICODE *unicode, Py_ssize_t size,
4528 Py_ssize_t startpos, Py_ssize_t endpos,
4529 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004531 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004532 *exceptionObject = PyUnicodeEncodeError_Create(
4533 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 }
4535 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004536 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4537 goto onError;
4538 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4539 goto onError;
4540 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4541 goto onError;
4542 return;
4543 onError:
4544 Py_DECREF(*exceptionObject);
4545 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 }
4547}
4548
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004550static void
4551raise_encode_exception(PyObject **exceptionObject,
4552 const char *encoding,
4553 const Py_UNICODE *unicode, Py_ssize_t size,
4554 Py_ssize_t startpos, Py_ssize_t endpos,
4555 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004556{
4557 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004558 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004559 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004561}
4562
4563/* error handling callback helper:
4564 build arguments, call the callback and check the arguments,
4565 put the result into newpos and return the replacement string, which
4566 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004567static PyObject *
4568unicode_encode_call_errorhandler(const char *errors,
4569 PyObject **errorHandler,
4570 const char *encoding, const char *reason,
4571 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4572 Py_ssize_t startpos, Py_ssize_t endpos,
4573 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004574{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004575 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004576
4577 PyObject *restuple;
4578 PyObject *resunicode;
4579
4580 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004581 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004582 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004584 }
4585
4586 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004589 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004590
4591 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004592 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004595 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004596 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 Py_DECREF(restuple);
4598 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004600 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 &resunicode, newpos)) {
4602 Py_DECREF(restuple);
4603 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004605 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4606 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4607 Py_DECREF(restuple);
4608 return NULL;
4609 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004612 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4614 Py_DECREF(restuple);
4615 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004616 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 Py_INCREF(resunicode);
4618 Py_DECREF(restuple);
4619 return resunicode;
4620}
4621
Alexander Belopolsky40018472011-02-26 01:02:56 +00004622static PyObject *
4623unicode_encode_ucs1(const Py_UNICODE *p,
4624 Py_ssize_t size,
4625 const char *errors,
4626 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627{
4628 /* output object */
4629 PyObject *res;
4630 /* pointers to the beginning and end+1 of input */
4631 const Py_UNICODE *startp = p;
4632 const Py_UNICODE *endp = p + size;
4633 /* pointer to the beginning of the unencodable characters */
4634 /* const Py_UNICODE *badp = NULL; */
4635 /* pointer into the output */
4636 char *str;
4637 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004638 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004639 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4640 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 PyObject *errorHandler = NULL;
4642 PyObject *exc = NULL;
4643 /* the following variable is used for caching string comparisons
4644 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4645 int known_errorHandler = -1;
4646
4647 /* allocate enough for a simple encoding without
4648 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004649 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004650 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004651 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004652 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004653 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004654 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004655 ressize = size;
4656
4657 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004658 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659
Benjamin Peterson29060642009-01-31 22:14:21 +00004660 /* can we encode this? */
4661 if (c<limit) {
4662 /* no overflow check, because we know that the space is enough */
4663 *str++ = (char)c;
4664 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004665 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004666 else {
4667 Py_ssize_t unicodepos = p-startp;
4668 Py_ssize_t requiredsize;
4669 PyObject *repunicode;
4670 Py_ssize_t repsize;
4671 Py_ssize_t newpos;
4672 Py_ssize_t respos;
4673 Py_UNICODE *uni2;
4674 /* startpos for collecting unencodable chars */
4675 const Py_UNICODE *collstart = p;
4676 const Py_UNICODE *collend = p;
4677 /* find all unecodable characters */
4678 while ((collend < endp) && ((*collend)>=limit))
4679 ++collend;
4680 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4681 if (known_errorHandler==-1) {
4682 if ((errors==NULL) || (!strcmp(errors, "strict")))
4683 known_errorHandler = 1;
4684 else if (!strcmp(errors, "replace"))
4685 known_errorHandler = 2;
4686 else if (!strcmp(errors, "ignore"))
4687 known_errorHandler = 3;
4688 else if (!strcmp(errors, "xmlcharrefreplace"))
4689 known_errorHandler = 4;
4690 else
4691 known_errorHandler = 0;
4692 }
4693 switch (known_errorHandler) {
4694 case 1: /* strict */
4695 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4696 goto onError;
4697 case 2: /* replace */
4698 while (collstart++<collend)
4699 *str++ = '?'; /* fall through */
4700 case 3: /* ignore */
4701 p = collend;
4702 break;
4703 case 4: /* xmlcharrefreplace */
4704 respos = str - PyBytes_AS_STRING(res);
4705 /* determine replacement size (temporarily (mis)uses p) */
4706 for (p = collstart, repsize = 0; p < collend; ++p) {
4707 if (*p<10)
4708 repsize += 2+1+1;
4709 else if (*p<100)
4710 repsize += 2+2+1;
4711 else if (*p<1000)
4712 repsize += 2+3+1;
4713 else if (*p<10000)
4714 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004715#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004716 else
4717 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004718#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004719 else if (*p<100000)
4720 repsize += 2+5+1;
4721 else if (*p<1000000)
4722 repsize += 2+6+1;
4723 else
4724 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004725#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004726 }
4727 requiredsize = respos+repsize+(endp-collend);
4728 if (requiredsize > ressize) {
4729 if (requiredsize<2*ressize)
4730 requiredsize = 2*ressize;
4731 if (_PyBytes_Resize(&res, requiredsize))
4732 goto onError;
4733 str = PyBytes_AS_STRING(res) + respos;
4734 ressize = requiredsize;
4735 }
4736 /* generate replacement (temporarily (mis)uses p) */
4737 for (p = collstart; p < collend; ++p) {
4738 str += sprintf(str, "&#%d;", (int)*p);
4739 }
4740 p = collend;
4741 break;
4742 default:
4743 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4744 encoding, reason, startp, size, &exc,
4745 collstart-startp, collend-startp, &newpos);
4746 if (repunicode == NULL)
4747 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004748 if (PyBytes_Check(repunicode)) {
4749 /* Directly copy bytes result to output. */
4750 repsize = PyBytes_Size(repunicode);
4751 if (repsize > 1) {
4752 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004753 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004754 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4755 Py_DECREF(repunicode);
4756 goto onError;
4757 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004758 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004759 ressize += repsize-1;
4760 }
4761 memcpy(str, PyBytes_AsString(repunicode), repsize);
4762 str += repsize;
4763 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004764 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004765 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004766 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004767 /* need more space? (at least enough for what we
4768 have+the replacement+the rest of the string, so
4769 we won't have to check space for encodable characters) */
4770 respos = str - PyBytes_AS_STRING(res);
4771 repsize = PyUnicode_GET_SIZE(repunicode);
4772 requiredsize = respos+repsize+(endp-collend);
4773 if (requiredsize > ressize) {
4774 if (requiredsize<2*ressize)
4775 requiredsize = 2*ressize;
4776 if (_PyBytes_Resize(&res, requiredsize)) {
4777 Py_DECREF(repunicode);
4778 goto onError;
4779 }
4780 str = PyBytes_AS_STRING(res) + respos;
4781 ressize = requiredsize;
4782 }
4783 /* check if there is anything unencodable in the replacement
4784 and copy it to the output */
4785 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4786 c = *uni2;
4787 if (c >= limit) {
4788 raise_encode_exception(&exc, encoding, startp, size,
4789 unicodepos, unicodepos+1, reason);
4790 Py_DECREF(repunicode);
4791 goto onError;
4792 }
4793 *str = (char)c;
4794 }
4795 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004796 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004797 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004798 }
4799 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004800 /* Resize if we allocated to much */
4801 size = str - PyBytes_AS_STRING(res);
4802 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004803 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004804 if (_PyBytes_Resize(&res, size) < 0)
4805 goto onError;
4806 }
4807
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004808 Py_XDECREF(errorHandler);
4809 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004810 return res;
4811
4812 onError:
4813 Py_XDECREF(res);
4814 Py_XDECREF(errorHandler);
4815 Py_XDECREF(exc);
4816 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817}
4818
Alexander Belopolsky40018472011-02-26 01:02:56 +00004819PyObject *
4820PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4821 Py_ssize_t size,
4822 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004824 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825}
4826
Alexander Belopolsky40018472011-02-26 01:02:56 +00004827PyObject *
4828PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829{
4830 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004831 PyErr_BadArgument();
4832 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833 }
4834 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 PyUnicode_GET_SIZE(unicode),
4836 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837}
4838
4839/* --- 7-bit ASCII Codec -------------------------------------------------- */
4840
Alexander Belopolsky40018472011-02-26 01:02:56 +00004841PyObject *
4842PyUnicode_DecodeASCII(const char *s,
4843 Py_ssize_t size,
4844 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004846 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847 PyUnicodeObject *v;
4848 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004849 Py_ssize_t startinpos;
4850 Py_ssize_t endinpos;
4851 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004852 const char *e;
4853 PyObject *errorHandler = NULL;
4854 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004855
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004857 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004858 Py_UNICODE r = *(unsigned char*)s;
4859 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004860 }
Tim Petersced69f82003-09-16 20:30:58 +00004861
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 v = _PyUnicode_New(size);
4863 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004868 e = s + size;
4869 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 register unsigned char c = (unsigned char)*s;
4871 if (c < 128) {
4872 *p++ = c;
4873 ++s;
4874 }
4875 else {
4876 startinpos = s-starts;
4877 endinpos = startinpos + 1;
4878 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4879 if (unicode_decode_call_errorhandler(
4880 errors, &errorHandler,
4881 "ascii", "ordinal not in range(128)",
4882 &starts, &e, &startinpos, &endinpos, &exc, &s,
4883 &v, &outpos, &p))
4884 goto onError;
4885 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004887 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4889 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004890 Py_XDECREF(errorHandler);
4891 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004892 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004893
Benjamin Peterson29060642009-01-31 22:14:21 +00004894 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 Py_XDECREF(errorHandler);
4897 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898 return NULL;
4899}
4900
Alexander Belopolsky40018472011-02-26 01:02:56 +00004901PyObject *
4902PyUnicode_EncodeASCII(const Py_UNICODE *p,
4903 Py_ssize_t size,
4904 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004906 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907}
4908
Alexander Belopolsky40018472011-02-26 01:02:56 +00004909PyObject *
4910PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911{
4912 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004913 PyErr_BadArgument();
4914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915 }
4916 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 PyUnicode_GET_SIZE(unicode),
4918 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919}
4920
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004921#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004922
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004923/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004924
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004925#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004926#define NEED_RETRY
4927#endif
4928
4929/* XXX This code is limited to "true" double-byte encodings, as
4930 a) it assumes an incomplete character consists of a single byte, and
4931 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004932 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004933
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead byte,
   0 otherwise.  IsDBCSLeadByte() alone is not trusted: the character that
   CharPrev() finds before the byte is inspected as well. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4945
4946/*
4947 * Decode MBCS string into unicode object. If 'final' is set, converts
4948 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4949 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004950static int
4951decode_mbcs(PyUnicodeObject **v,
4952 const char *s, /* MBCS string */
4953 int size, /* sizeof MBCS string */
4954 int final,
4955 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004956{
4957 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004958 Py_ssize_t n;
4959 DWORD usize;
4960 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004961
4962 assert(size >= 0);
4963
Victor Stinner554f3f02010-06-16 23:33:54 +00004964 /* check and handle 'errors' arg */
4965 if (errors==NULL || strcmp(errors, "strict")==0)
4966 flags = MB_ERR_INVALID_CHARS;
4967 else if (strcmp(errors, "ignore")==0)
4968 flags = 0;
4969 else {
4970 PyErr_Format(PyExc_ValueError,
4971 "mbcs encoding does not support errors='%s'",
4972 errors);
4973 return -1;
4974 }
4975
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004976 /* Skip trailing lead-byte unless 'final' is set */
4977 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004979
4980 /* First get the size of the result */
4981 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004982 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4983 if (usize==0)
4984 goto mbcs_decode_error;
4985 } else
4986 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004987
4988 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 /* Create unicode object */
4990 *v = _PyUnicode_New(usize);
4991 if (*v == NULL)
4992 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004993 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004994 }
4995 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004996 /* Extend unicode object */
4997 n = PyUnicode_GET_SIZE(*v);
4998 if (_PyUnicode_Resize(v, n + usize) < 0)
4999 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000 }
5001
5002 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00005003 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005005 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5006 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005008 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005009 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00005010
5011mbcs_decode_error:
5012 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5013 we raise a UnicodeDecodeError - else it is a 'generic'
5014 windows error
5015 */
5016 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5017 /* Ideally, we should get reason from FormatMessage - this
5018 is the Windows 2000 English version of the message
5019 */
5020 PyObject *exc = NULL;
5021 const char *reason = "No mapping for the Unicode character exists "
5022 "in the target multi-byte code page.";
5023 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5024 if (exc != NULL) {
5025 PyCodec_StrictErrors(exc);
5026 Py_DECREF(exc);
5027 }
5028 } else {
5029 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5030 }
5031 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005032}
5033
Alexander Belopolsky40018472011-02-26 01:02:56 +00005034PyObject *
5035PyUnicode_DecodeMBCSStateful(const char *s,
5036 Py_ssize_t size,
5037 const char *errors,
5038 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005039{
5040 PyUnicodeObject *v = NULL;
5041 int done;
5042
5043 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005045
5046#ifdef NEED_RETRY
5047 retry:
5048 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005049 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005050 else
5051#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005052 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005053
5054 if (done < 0) {
5055 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005057 }
5058
5059 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005061
5062#ifdef NEED_RETRY
5063 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 s += done;
5065 size -= done;
5066 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005067 }
5068#endif
5069
5070 return (PyObject *)v;
5071}
5072
Alexander Belopolsky40018472011-02-26 01:02:56 +00005073PyObject *
5074PyUnicode_DecodeMBCS(const char *s,
5075 Py_ssize_t size,
5076 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005077{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005078 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5079}
5080
5081/*
5082 * Convert unicode into string object (MBCS).
5083 * Returns 0 if succeed, -1 otherwise.
5084 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005085static int
5086encode_mbcs(PyObject **repr,
5087 const Py_UNICODE *p, /* unicode */
5088 int size, /* size of unicode */
5089 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005090{
Victor Stinner554f3f02010-06-16 23:33:54 +00005091 BOOL usedDefaultChar = FALSE;
5092 BOOL *pusedDefaultChar;
5093 int mbcssize;
5094 Py_ssize_t n;
5095 PyObject *exc = NULL;
5096 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005097
5098 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005099
Victor Stinner554f3f02010-06-16 23:33:54 +00005100 /* check and handle 'errors' arg */
5101 if (errors==NULL || strcmp(errors, "strict")==0) {
5102 flags = WC_NO_BEST_FIT_CHARS;
5103 pusedDefaultChar = &usedDefaultChar;
5104 } else if (strcmp(errors, "replace")==0) {
5105 flags = 0;
5106 pusedDefaultChar = NULL;
5107 } else {
5108 PyErr_Format(PyExc_ValueError,
5109 "mbcs encoding does not support errors='%s'",
5110 errors);
5111 return -1;
5112 }
5113
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005114 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005115 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005116 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5117 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005118 if (mbcssize == 0) {
5119 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5120 return -1;
5121 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005122 /* If we used a default char, then we failed! */
5123 if (pusedDefaultChar && *pusedDefaultChar)
5124 goto mbcs_encode_error;
5125 } else {
5126 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005127 }
5128
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005129 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 /* Create string object */
5131 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5132 if (*repr == NULL)
5133 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005134 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005135 }
5136 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005137 /* Extend string object */
5138 n = PyBytes_Size(*repr);
5139 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5140 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005141 }
5142
5143 /* Do the conversion */
5144 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005145 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005146 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5147 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5149 return -1;
5150 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005151 if (pusedDefaultChar && *pusedDefaultChar)
5152 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005153 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005154 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005155
5156mbcs_encode_error:
5157 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5158 Py_XDECREF(exc);
5159 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005160}
5161
Alexander Belopolsky40018472011-02-26 01:02:56 +00005162PyObject *
5163PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5164 Py_ssize_t size,
5165 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005166{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005167 PyObject *repr = NULL;
5168 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005169
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005170#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005172 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005173 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005174 else
5175#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005176 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005177
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005178 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 Py_XDECREF(repr);
5180 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005181 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005182
5183#ifdef NEED_RETRY
5184 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005185 p += INT_MAX;
5186 size -= INT_MAX;
5187 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005188 }
5189#endif
5190
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005191 return repr;
5192}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005193
Alexander Belopolsky40018472011-02-26 01:02:56 +00005194PyObject *
5195PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005196{
5197 if (!PyUnicode_Check(unicode)) {
5198 PyErr_BadArgument();
5199 return NULL;
5200 }
5201 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005202 PyUnicode_GET_SIZE(unicode),
5203 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005204}
5205
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005206#undef NEED_RETRY
5207
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005208#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005209
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210/* --- Character Mapping Codec -------------------------------------------- */
5211
Alexander Belopolsky40018472011-02-26 01:02:56 +00005212PyObject *
5213PyUnicode_DecodeCharmap(const char *s,
5214 Py_ssize_t size,
5215 PyObject *mapping,
5216 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005219 Py_ssize_t startinpos;
5220 Py_ssize_t endinpos;
5221 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 PyUnicodeObject *v;
5224 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005225 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226 PyObject *errorHandler = NULL;
5227 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005228 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005230
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 /* Default to Latin-1 */
5232 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234
5235 v = _PyUnicode_New(size);
5236 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005241 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005242 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 mapstring = PyUnicode_AS_UNICODE(mapping);
5244 maplen = PyUnicode_GET_SIZE(mapping);
5245 while (s < e) {
5246 unsigned char ch = *s;
5247 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 if (ch < maplen)
5250 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 if (x == 0xfffe) {
5253 /* undefined mapping */
5254 outpos = p-PyUnicode_AS_UNICODE(v);
5255 startinpos = s-starts;
5256 endinpos = startinpos+1;
5257 if (unicode_decode_call_errorhandler(
5258 errors, &errorHandler,
5259 "charmap", "character maps to <undefined>",
5260 &starts, &e, &startinpos, &endinpos, &exc, &s,
5261 &v, &outpos, &p)) {
5262 goto onError;
5263 }
5264 continue;
5265 }
5266 *p++ = x;
5267 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005268 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005269 }
5270 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 while (s < e) {
5272 unsigned char ch = *s;
5273 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005274
Benjamin Peterson29060642009-01-31 22:14:21 +00005275 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5276 w = PyLong_FromLong((long)ch);
5277 if (w == NULL)
5278 goto onError;
5279 x = PyObject_GetItem(mapping, w);
5280 Py_DECREF(w);
5281 if (x == NULL) {
5282 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5283 /* No mapping found means: mapping is undefined. */
5284 PyErr_Clear();
5285 x = Py_None;
5286 Py_INCREF(x);
5287 } else
5288 goto onError;
5289 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005290
Benjamin Peterson29060642009-01-31 22:14:21 +00005291 /* Apply mapping */
5292 if (PyLong_Check(x)) {
5293 long value = PyLong_AS_LONG(x);
5294 if (value < 0 || value > 65535) {
5295 PyErr_SetString(PyExc_TypeError,
5296 "character mapping must be in range(65536)");
5297 Py_DECREF(x);
5298 goto onError;
5299 }
5300 *p++ = (Py_UNICODE)value;
5301 }
5302 else if (x == Py_None) {
5303 /* undefined mapping */
5304 outpos = p-PyUnicode_AS_UNICODE(v);
5305 startinpos = s-starts;
5306 endinpos = startinpos+1;
5307 if (unicode_decode_call_errorhandler(
5308 errors, &errorHandler,
5309 "charmap", "character maps to <undefined>",
5310 &starts, &e, &startinpos, &endinpos, &exc, &s,
5311 &v, &outpos, &p)) {
5312 Py_DECREF(x);
5313 goto onError;
5314 }
5315 Py_DECREF(x);
5316 continue;
5317 }
5318 else if (PyUnicode_Check(x)) {
5319 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005320
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 if (targetsize == 1)
5322 /* 1-1 mapping */
5323 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005324
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 else if (targetsize > 1) {
5326 /* 1-n mapping */
5327 if (targetsize > extrachars) {
5328 /* resize first */
5329 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5330 Py_ssize_t needed = (targetsize - extrachars) + \
5331 (targetsize << 2);
5332 extrachars += needed;
5333 /* XXX overflow detection missing */
5334 if (_PyUnicode_Resize(&v,
5335 PyUnicode_GET_SIZE(v) + needed) < 0) {
5336 Py_DECREF(x);
5337 goto onError;
5338 }
5339 p = PyUnicode_AS_UNICODE(v) + oldpos;
5340 }
5341 Py_UNICODE_COPY(p,
5342 PyUnicode_AS_UNICODE(x),
5343 targetsize);
5344 p += targetsize;
5345 extrachars -= targetsize;
5346 }
5347 /* 1-0 mapping: skip the character */
5348 }
5349 else {
5350 /* wrong return value */
5351 PyErr_SetString(PyExc_TypeError,
5352 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005353 Py_DECREF(x);
5354 goto onError;
5355 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 Py_DECREF(x);
5357 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 }
5360 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5362 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005363 Py_XDECREF(errorHandler);
5364 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005366
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005368 Py_XDECREF(errorHandler);
5369 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 Py_XDECREF(v);
5371 return NULL;
5372}
5373
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005374/* Charmap encoding: the lookup table */
5375
Alexander Belopolsky40018472011-02-26 01:02:56 +00005376struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 PyObject_HEAD
5378 unsigned char level1[32];
5379 int count2, count3;
5380 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005381};
5382
5383static PyObject*
5384encoding_map_size(PyObject *obj, PyObject* args)
5385{
5386 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005387 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005389}
5390
5391static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005392 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 PyDoc_STR("Return the size (in bytes) of this object") },
5394 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005395};
5396
5397static void
5398encoding_map_dealloc(PyObject* o)
5399{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005400 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005401}
5402
5403static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005404 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 "EncodingMap", /*tp_name*/
5406 sizeof(struct encoding_map), /*tp_basicsize*/
5407 0, /*tp_itemsize*/
5408 /* methods */
5409 encoding_map_dealloc, /*tp_dealloc*/
5410 0, /*tp_print*/
5411 0, /*tp_getattr*/
5412 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005413 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 0, /*tp_repr*/
5415 0, /*tp_as_number*/
5416 0, /*tp_as_sequence*/
5417 0, /*tp_as_mapping*/
5418 0, /*tp_hash*/
5419 0, /*tp_call*/
5420 0, /*tp_str*/
5421 0, /*tp_getattro*/
5422 0, /*tp_setattro*/
5423 0, /*tp_as_buffer*/
5424 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5425 0, /*tp_doc*/
5426 0, /*tp_traverse*/
5427 0, /*tp_clear*/
5428 0, /*tp_richcompare*/
5429 0, /*tp_weaklistoffset*/
5430 0, /*tp_iter*/
5431 0, /*tp_iternext*/
5432 encoding_map_methods, /*tp_methods*/
5433 0, /*tp_members*/
5434 0, /*tp_getset*/
5435 0, /*tp_base*/
5436 0, /*tp_dict*/
5437 0, /*tp_descr_get*/
5438 0, /*tp_descr_set*/
5439 0, /*tp_dictoffset*/
5440 0, /*tp_init*/
5441 0, /*tp_alloc*/
5442 0, /*tp_new*/
5443 0, /*tp_free*/
5444 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005445};
5446
5447PyObject*
5448PyUnicode_BuildEncodingMap(PyObject* string)
5449{
5450 Py_UNICODE *decode;
5451 PyObject *result;
5452 struct encoding_map *mresult;
5453 int i;
5454 int need_dict = 0;
5455 unsigned char level1[32];
5456 unsigned char level2[512];
5457 unsigned char *mlevel1, *mlevel2, *mlevel3;
5458 int count2 = 0, count3 = 0;
5459
5460 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5461 PyErr_BadArgument();
5462 return NULL;
5463 }
5464 decode = PyUnicode_AS_UNICODE(string);
5465 memset(level1, 0xFF, sizeof level1);
5466 memset(level2, 0xFF, sizeof level2);
5467
5468 /* If there isn't a one-to-one mapping of NULL to \0,
5469 or if there are non-BMP characters, we need to use
5470 a mapping dictionary. */
5471 if (decode[0] != 0)
5472 need_dict = 1;
5473 for (i = 1; i < 256; i++) {
5474 int l1, l2;
5475 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005476#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005477 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005478#endif
5479 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005480 need_dict = 1;
5481 break;
5482 }
5483 if (decode[i] == 0xFFFE)
5484 /* unmapped character */
5485 continue;
5486 l1 = decode[i] >> 11;
5487 l2 = decode[i] >> 7;
5488 if (level1[l1] == 0xFF)
5489 level1[l1] = count2++;
5490 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005491 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005492 }
5493
5494 if (count2 >= 0xFF || count3 >= 0xFF)
5495 need_dict = 1;
5496
5497 if (need_dict) {
5498 PyObject *result = PyDict_New();
5499 PyObject *key, *value;
5500 if (!result)
5501 return NULL;
5502 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005503 key = PyLong_FromLong(decode[i]);
5504 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005505 if (!key || !value)
5506 goto failed1;
5507 if (PyDict_SetItem(result, key, value) == -1)
5508 goto failed1;
5509 Py_DECREF(key);
5510 Py_DECREF(value);
5511 }
5512 return result;
5513 failed1:
5514 Py_XDECREF(key);
5515 Py_XDECREF(value);
5516 Py_DECREF(result);
5517 return NULL;
5518 }
5519
5520 /* Create a three-level trie */
5521 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5522 16*count2 + 128*count3 - 1);
5523 if (!result)
5524 return PyErr_NoMemory();
5525 PyObject_Init(result, &EncodingMapType);
5526 mresult = (struct encoding_map*)result;
5527 mresult->count2 = count2;
5528 mresult->count3 = count3;
5529 mlevel1 = mresult->level1;
5530 mlevel2 = mresult->level23;
5531 mlevel3 = mresult->level23 + 16*count2;
5532 memcpy(mlevel1, level1, 32);
5533 memset(mlevel2, 0xFF, 16*count2);
5534 memset(mlevel3, 0, 128*count3);
5535 count3 = 0;
5536 for (i = 1; i < 256; i++) {
5537 int o1, o2, o3, i2, i3;
5538 if (decode[i] == 0xFFFE)
5539 /* unmapped character */
5540 continue;
5541 o1 = decode[i]>>11;
5542 o2 = (decode[i]>>7) & 0xF;
5543 i2 = 16*mlevel1[o1] + o2;
5544 if (mlevel2[i2] == 0xFF)
5545 mlevel2[i2] = count3++;
5546 o3 = decode[i] & 0x7F;
5547 i3 = 128*mlevel2[i2] + o3;
5548 mlevel3[i3] = i;
5549 }
5550 return result;
5551}
5552
5553static int
5554encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5555{
5556 struct encoding_map *map = (struct encoding_map*)mapping;
5557 int l1 = c>>11;
5558 int l2 = (c>>7) & 0xF;
5559 int l3 = c & 0x7F;
5560 int i;
5561
5562#ifdef Py_UNICODE_WIDE
5563 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005565 }
5566#endif
5567 if (c == 0)
5568 return 0;
5569 /* level 1*/
5570 i = map->level1[l1];
5571 if (i == 0xFF) {
5572 return -1;
5573 }
5574 /* level 2*/
5575 i = map->level23[16*i+l2];
5576 if (i == 0xFF) {
5577 return -1;
5578 }
5579 /* level 3 */
5580 i = map->level23[16*map->count2 + 128*i + l3];
5581 if (i == 0) {
5582 return -1;
5583 }
5584 return i;
5585}
5586
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587/* Lookup the character ch in the mapping. If the character
5588 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005589 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005590static PyObject *
5591charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592{
Christian Heimes217cfd12007-12-02 14:31:20 +00005593 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594 PyObject *x;
5595
5596 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005597 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 x = PyObject_GetItem(mapping, w);
5599 Py_DECREF(w);
5600 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5602 /* No mapping found means: mapping is undefined. */
5603 PyErr_Clear();
5604 x = Py_None;
5605 Py_INCREF(x);
5606 return x;
5607 } else
5608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005610 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005612 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 long value = PyLong_AS_LONG(x);
5614 if (value < 0 || value > 255) {
5615 PyErr_SetString(PyExc_TypeError,
5616 "character mapping must be in range(256)");
5617 Py_DECREF(x);
5618 return NULL;
5619 }
5620 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005622 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005623 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 /* wrong return value */
5626 PyErr_Format(PyExc_TypeError,
5627 "character mapping must return integer, bytes or None, not %.400s",
5628 x->ob_type->tp_name);
5629 Py_DECREF(x);
5630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 }
5632}
5633
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005634static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005635charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005636{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005637 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5638 /* exponentially overallocate to minimize reallocations */
5639 if (requiredsize < 2*outsize)
5640 requiredsize = 2*outsize;
5641 if (_PyBytes_Resize(outobj, requiredsize))
5642 return -1;
5643 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005644}
5645
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,      /* the encoded byte(s) were written to the output */
    enc_FAILED,       /* the mapping is undefined for the character */
    enc_EXCEPTION     /* the lookup or the output resize failed */
} charmapencode_result;
/* lookup the character, put the result in the output string and adjust
   various state variables. Resize the output bytes object if not enough
   space is available. Return enc_SUCCESS if the character was written
   to the output buffer, enc_FAILED if the mapping was undefined
   (in which case no character was written) or enc_EXCEPTION if the
   lookup or the reallocation failed. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005655static charmapencode_result
5656charmapencode_output(Py_UNICODE c, PyObject *mapping,
5657 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005659 PyObject *rep;
5660 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005661 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005662
Christian Heimes90aa7642007-12-19 02:45:37 +00005663 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005664 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005666 if (res == -1)
5667 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 if (outsize<requiredsize)
5669 if (charmapencode_resize(outobj, outpos, requiredsize))
5670 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005671 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 outstart[(*outpos)++] = (char)res;
5673 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005674 }
5675
5676 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005677 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005679 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 Py_DECREF(rep);
5681 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005682 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 if (PyLong_Check(rep)) {
5684 Py_ssize_t requiredsize = *outpos+1;
5685 if (outsize<requiredsize)
5686 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5687 Py_DECREF(rep);
5688 return enc_EXCEPTION;
5689 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005690 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005692 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 else {
5694 const char *repchars = PyBytes_AS_STRING(rep);
5695 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5696 Py_ssize_t requiredsize = *outpos+repsize;
5697 if (outsize<requiredsize)
5698 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5699 Py_DECREF(rep);
5700 return enc_EXCEPTION;
5701 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005702 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 memcpy(outstart + *outpos, repchars, repsize);
5704 *outpos += repsize;
5705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005707 Py_DECREF(rep);
5708 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709}
5710
5711/* handle an error in PyUnicode_EncodeCharmap
5712 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005713static int
5714charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005715 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005717 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005718 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005719{
5720 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005721 Py_ssize_t repsize;
5722 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005723 Py_UNICODE *uni2;
5724 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005725 Py_ssize_t collstartpos = *inpos;
5726 Py_ssize_t collendpos = *inpos+1;
5727 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728 char *encoding = "charmap";
5729 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005730 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 /* find all unencodable characters */
5733 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005734 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005735 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 int res = encoding_map_lookup(p[collendpos], mapping);
5737 if (res != -1)
5738 break;
5739 ++collendpos;
5740 continue;
5741 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005742
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 rep = charmapencode_lookup(p[collendpos], mapping);
5744 if (rep==NULL)
5745 return -1;
5746 else if (rep!=Py_None) {
5747 Py_DECREF(rep);
5748 break;
5749 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005750 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 }
5753 /* cache callback name lookup
5754 * (if not done yet, i.e. it's the first error) */
5755 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 if ((errors==NULL) || (!strcmp(errors, "strict")))
5757 *known_errorHandler = 1;
5758 else if (!strcmp(errors, "replace"))
5759 *known_errorHandler = 2;
5760 else if (!strcmp(errors, "ignore"))
5761 *known_errorHandler = 3;
5762 else if (!strcmp(errors, "xmlcharrefreplace"))
5763 *known_errorHandler = 4;
5764 else
5765 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 }
5767 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005768 case 1: /* strict */
5769 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5770 return -1;
5771 case 2: /* replace */
5772 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 x = charmapencode_output('?', mapping, res, respos);
5774 if (x==enc_EXCEPTION) {
5775 return -1;
5776 }
5777 else if (x==enc_FAILED) {
5778 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5779 return -1;
5780 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005781 }
5782 /* fall through */
5783 case 3: /* ignore */
5784 *inpos = collendpos;
5785 break;
5786 case 4: /* xmlcharrefreplace */
5787 /* generate replacement (temporarily (mis)uses p) */
5788 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 char buffer[2+29+1+1];
5790 char *cp;
5791 sprintf(buffer, "&#%d;", (int)p[collpos]);
5792 for (cp = buffer; *cp; ++cp) {
5793 x = charmapencode_output(*cp, mapping, res, respos);
5794 if (x==enc_EXCEPTION)
5795 return -1;
5796 else if (x==enc_FAILED) {
5797 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5798 return -1;
5799 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005800 }
5801 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005802 *inpos = collendpos;
5803 break;
5804 default:
5805 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 encoding, reason, p, size, exceptionObject,
5807 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005808 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005810 if (PyBytes_Check(repunicode)) {
5811 /* Directly copy bytes result to output. */
5812 Py_ssize_t outsize = PyBytes_Size(*res);
5813 Py_ssize_t requiredsize;
5814 repsize = PyBytes_Size(repunicode);
5815 requiredsize = *respos + repsize;
5816 if (requiredsize > outsize)
5817 /* Make room for all additional bytes. */
5818 if (charmapencode_resize(res, respos, requiredsize)) {
5819 Py_DECREF(repunicode);
5820 return -1;
5821 }
5822 memcpy(PyBytes_AsString(*res) + *respos,
5823 PyBytes_AsString(repunicode), repsize);
5824 *respos += repsize;
5825 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005826 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005827 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005829 /* generate replacement */
5830 repsize = PyUnicode_GET_SIZE(repunicode);
5831 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 x = charmapencode_output(*uni2, mapping, res, respos);
5833 if (x==enc_EXCEPTION) {
5834 return -1;
5835 }
5836 else if (x==enc_FAILED) {
5837 Py_DECREF(repunicode);
5838 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5839 return -1;
5840 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005841 }
5842 *inpos = newpos;
5843 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005844 }
5845 return 0;
5846}
5847
Alexander Belopolsky40018472011-02-26 01:02:56 +00005848PyObject *
5849PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5850 Py_ssize_t size,
5851 PyObject *mapping,
5852 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005854 /* output object */
5855 PyObject *res = NULL;
5856 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005857 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005858 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005859 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005860 PyObject *errorHandler = NULL;
5861 PyObject *exc = NULL;
5862 /* the following variable is used for caching string comparisons
5863 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5864 * 3=ignore, 4=xmlcharrefreplace */
5865 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866
5867 /* Default to Latin-1 */
5868 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005871 /* allocate enough for a simple encoding without
5872 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005873 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 if (res == NULL)
5875 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005876 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 /* try to encode it */
5881 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5882 if (x==enc_EXCEPTION) /* error */
5883 goto onError;
5884 if (x==enc_FAILED) { /* unencodable character */
5885 if (charmap_encoding_error(p, size, &inpos, mapping,
5886 &exc,
5887 &known_errorHandler, &errorHandler, errors,
5888 &res, &respos)) {
5889 goto onError;
5890 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005891 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 else
5893 /* done with this character => adjust input position */
5894 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005898 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005899 if (_PyBytes_Resize(&res, respos) < 0)
5900 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005901
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902 Py_XDECREF(exc);
5903 Py_XDECREF(errorHandler);
5904 return res;
5905
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005907 Py_XDECREF(res);
5908 Py_XDECREF(exc);
5909 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 return NULL;
5911}
5912
Alexander Belopolsky40018472011-02-26 01:02:56 +00005913PyObject *
5914PyUnicode_AsCharmapString(PyObject *unicode,
5915 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916{
5917 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 PyErr_BadArgument();
5919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 }
5921 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 PyUnicode_GET_SIZE(unicode),
5923 mapping,
5924 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925}
5926
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005927/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005928static void
5929make_translate_exception(PyObject **exceptionObject,
5930 const Py_UNICODE *unicode, Py_ssize_t size,
5931 Py_ssize_t startpos, Py_ssize_t endpos,
5932 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005934 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005935 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 }
5938 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5940 goto onError;
5941 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5942 goto onError;
5943 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5944 goto onError;
5945 return;
5946 onError:
5947 Py_DECREF(*exceptionObject);
5948 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 }
5950}
5951
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005952/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005953static void
5954raise_translate_exception(PyObject **exceptionObject,
5955 const Py_UNICODE *unicode, Py_ssize_t size,
5956 Py_ssize_t startpos, Py_ssize_t endpos,
5957 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958{
5959 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963}
5964
5965/* error handling callback helper:
5966 build arguments, call the callback and check the arguments,
5967 put the result into newpos and return the replacement string, which
5968 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005969static PyObject *
5970unicode_translate_call_errorhandler(const char *errors,
5971 PyObject **errorHandler,
5972 const char *reason,
5973 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5974 Py_ssize_t startpos, Py_ssize_t endpos,
5975 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005977 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005978
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005979 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980 PyObject *restuple;
5981 PyObject *resunicode;
5982
5983 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005985 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 }
5988
5989 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005991 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005993
5994 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005997 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00005999 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006000 Py_DECREF(restuple);
6001 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002 }
6003 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 &resunicode, &i_newpos)) {
6005 Py_DECREF(restuple);
6006 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006008 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006010 else
6011 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006012 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6014 Py_DECREF(restuple);
6015 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006016 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017 Py_INCREF(resunicode);
6018 Py_DECREF(restuple);
6019 return resunicode;
6020}
6021
6022/* Lookup the character ch in the mapping and put the result in result,
6023 which must be decrefed by the caller.
6024 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006025static int
6026charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006027{
Christian Heimes217cfd12007-12-02 14:31:20 +00006028 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 PyObject *x;
6030
6031 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006032 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 x = PyObject_GetItem(mapping, w);
6034 Py_DECREF(w);
6035 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6037 /* No mapping found means: use 1:1 mapping. */
6038 PyErr_Clear();
6039 *result = NULL;
6040 return 0;
6041 } else
6042 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 }
6044 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 *result = x;
6046 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006047 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006048 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 long value = PyLong_AS_LONG(x);
6050 long max = PyUnicode_GetMax();
6051 if (value < 0 || value > max) {
6052 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006053 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 Py_DECREF(x);
6055 return -1;
6056 }
6057 *result = x;
6058 return 0;
6059 }
6060 else if (PyUnicode_Check(x)) {
6061 *result = x;
6062 return 0;
6063 }
6064 else {
6065 /* wrong return value */
6066 PyErr_SetString(PyExc_TypeError,
6067 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006068 Py_DECREF(x);
6069 return -1;
6070 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071}
6072/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 if not reallocate and adjust various state variables.
6074 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006075static int
6076charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006079 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006080 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 /* remember old output position */
6082 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6083 /* exponentially overallocate to minimize reallocations */
6084 if (requiredsize < 2 * oldsize)
6085 requiredsize = 2 * oldsize;
6086 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6087 return -1;
6088 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006089 }
6090 return 0;
6091}
6092/* lookup the character, put the result in the output string and adjust
6093 various state variables. Return a new reference to the object that
6094 was put in the output buffer in *result, or Py_None, if the mapping was
6095 undefined (in which case no character was written).
6096 The called must decref result.
6097 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006098static int
6099charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6100 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6101 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102{
Walter Dörwald4894c302003-10-24 14:25:28 +00006103 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006105 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 /* not found => default to 1:1 mapping */
6107 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108 }
6109 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006111 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 /* no overflow check, because we know that the space is enough */
6113 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 }
6115 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6117 if (repsize==1) {
6118 /* no overflow check, because we know that the space is enough */
6119 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6120 }
6121 else if (repsize!=0) {
6122 /* more than one character */
6123 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6124 (insize - (curinp-startinp)) +
6125 repsize - 1;
6126 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6127 return -1;
6128 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6129 *outp += repsize;
6130 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 }
6132 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 return 0;
6135}
6136
Alexander Belopolsky40018472011-02-26 01:02:56 +00006137PyObject *
6138PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6139 Py_ssize_t size,
6140 PyObject *mapping,
6141 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 /* output object */
6144 PyObject *res = NULL;
6145 /* pointers to the beginning and end+1 of input */
6146 const Py_UNICODE *startp = p;
6147 const Py_UNICODE *endp = p + size;
6148 /* pointer into the output */
6149 Py_UNICODE *str;
6150 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006151 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152 char *reason = "character maps to <undefined>";
6153 PyObject *errorHandler = NULL;
6154 PyObject *exc = NULL;
6155 /* the following variable is used for caching string comparisons
6156 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6157 * 3=ignore, 4=xmlcharrefreplace */
6158 int known_errorHandler = -1;
6159
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 PyErr_BadArgument();
6162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164
6165 /* allocate enough for a simple 1:1 translation without
6166 replacements, if we need more, we'll resize */
6167 res = PyUnicode_FromUnicode(NULL, size);
6168 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006171 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006172 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 /* try to encode it */
6176 PyObject *x = NULL;
6177 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6178 Py_XDECREF(x);
6179 goto onError;
6180 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006181 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 if (x!=Py_None) /* it worked => adjust input pointer */
6183 ++p;
6184 else { /* untranslatable character */
6185 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6186 Py_ssize_t repsize;
6187 Py_ssize_t newpos;
6188 Py_UNICODE *uni2;
6189 /* startpos for collecting untranslatable chars */
6190 const Py_UNICODE *collstart = p;
6191 const Py_UNICODE *collend = p+1;
6192 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 /* find all untranslatable characters */
6195 while (collend < endp) {
6196 if (charmaptranslate_lookup(*collend, mapping, &x))
6197 goto onError;
6198 Py_XDECREF(x);
6199 if (x!=Py_None)
6200 break;
6201 ++collend;
6202 }
6203 /* cache callback name lookup
6204 * (if not done yet, i.e. it's the first error) */
6205 if (known_errorHandler==-1) {
6206 if ((errors==NULL) || (!strcmp(errors, "strict")))
6207 known_errorHandler = 1;
6208 else if (!strcmp(errors, "replace"))
6209 known_errorHandler = 2;
6210 else if (!strcmp(errors, "ignore"))
6211 known_errorHandler = 3;
6212 else if (!strcmp(errors, "xmlcharrefreplace"))
6213 known_errorHandler = 4;
6214 else
6215 known_errorHandler = 0;
6216 }
6217 switch (known_errorHandler) {
6218 case 1: /* strict */
6219 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006220 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006221 case 2: /* replace */
6222 /* No need to check for space, this is a 1:1 replacement */
6223 for (coll = collstart; coll<collend; ++coll)
6224 *str++ = '?';
6225 /* fall through */
6226 case 3: /* ignore */
6227 p = collend;
6228 break;
6229 case 4: /* xmlcharrefreplace */
6230 /* generate replacement (temporarily (mis)uses p) */
6231 for (p = collstart; p < collend; ++p) {
6232 char buffer[2+29+1+1];
6233 char *cp;
6234 sprintf(buffer, "&#%d;", (int)*p);
6235 if (charmaptranslate_makespace(&res, &str,
6236 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6237 goto onError;
6238 for (cp = buffer; *cp; ++cp)
6239 *str++ = *cp;
6240 }
6241 p = collend;
6242 break;
6243 default:
6244 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6245 reason, startp, size, &exc,
6246 collstart-startp, collend-startp, &newpos);
6247 if (repunicode == NULL)
6248 goto onError;
6249 /* generate replacement */
6250 repsize = PyUnicode_GET_SIZE(repunicode);
6251 if (charmaptranslate_makespace(&res, &str,
6252 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6253 Py_DECREF(repunicode);
6254 goto onError;
6255 }
6256 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6257 *str++ = *uni2;
6258 p = startp + newpos;
6259 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006260 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006261 }
6262 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006263 /* Resize if we allocated to much */
6264 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006265 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 if (PyUnicode_Resize(&res, respos) < 0)
6267 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006268 }
6269 Py_XDECREF(exc);
6270 Py_XDECREF(errorHandler);
6271 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006274 Py_XDECREF(res);
6275 Py_XDECREF(exc);
6276 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 return NULL;
6278}
6279
Alexander Belopolsky40018472011-02-26 01:02:56 +00006280PyObject *
6281PyUnicode_Translate(PyObject *str,
6282 PyObject *mapping,
6283 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284{
6285 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006286
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 str = PyUnicode_FromObject(str);
6288 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006290 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006291 PyUnicode_GET_SIZE(str),
6292 mapping,
6293 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 Py_DECREF(str);
6295 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006296
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 Py_XDECREF(str);
6299 return NULL;
6300}
Tim Petersced69f82003-09-16 20:30:58 +00006301
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006302PyObject *
6303PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6304 Py_ssize_t length)
6305{
6306 PyObject *result;
6307 Py_UNICODE *p; /* write pointer into result */
6308 Py_ssize_t i;
6309 /* Copy to a new string */
6310 result = (PyObject *)_PyUnicode_New(length);
6311 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6312 if (result == NULL)
6313 return result;
6314 p = PyUnicode_AS_UNICODE(result);
6315 /* Iterate over code points */
6316 for (i = 0; i < length; i++) {
6317 Py_UNICODE ch =s[i];
6318 if (ch > 127) {
6319 int decimal = Py_UNICODE_TODECIMAL(ch);
6320 if (decimal >= 0)
6321 p[i] = '0' + decimal;
6322 }
6323 }
6324 return result;
6325}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006326/* --- Decimal Encoder ---------------------------------------------------- */
6327
Alexander Belopolsky40018472011-02-26 01:02:56 +00006328int
6329PyUnicode_EncodeDecimal(Py_UNICODE *s,
6330 Py_ssize_t length,
6331 char *output,
6332 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006333{
6334 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006335 PyObject *errorHandler = NULL;
6336 PyObject *exc = NULL;
6337 const char *encoding = "decimal";
6338 const char *reason = "invalid decimal Unicode string";
6339 /* the following variable is used for caching string comparisons
6340 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6341 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006342
6343 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 PyErr_BadArgument();
6345 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006346 }
6347
6348 p = s;
6349 end = s + length;
6350 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 register Py_UNICODE ch = *p;
6352 int decimal;
6353 PyObject *repunicode;
6354 Py_ssize_t repsize;
6355 Py_ssize_t newpos;
6356 Py_UNICODE *uni2;
6357 Py_UNICODE *collstart;
6358 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006359
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006361 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 ++p;
6363 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 decimal = Py_UNICODE_TODECIMAL(ch);
6366 if (decimal >= 0) {
6367 *output++ = '0' + decimal;
6368 ++p;
6369 continue;
6370 }
6371 if (0 < ch && ch < 256) {
6372 *output++ = (char)ch;
6373 ++p;
6374 continue;
6375 }
6376 /* All other characters are considered unencodable */
6377 collstart = p;
6378 collend = p+1;
6379 while (collend < end) {
6380 if ((0 < *collend && *collend < 256) ||
6381 !Py_UNICODE_ISSPACE(*collend) ||
6382 Py_UNICODE_TODECIMAL(*collend))
6383 break;
6384 }
6385 /* cache callback name lookup
6386 * (if not done yet, i.e. it's the first error) */
6387 if (known_errorHandler==-1) {
6388 if ((errors==NULL) || (!strcmp(errors, "strict")))
6389 known_errorHandler = 1;
6390 else if (!strcmp(errors, "replace"))
6391 known_errorHandler = 2;
6392 else if (!strcmp(errors, "ignore"))
6393 known_errorHandler = 3;
6394 else if (!strcmp(errors, "xmlcharrefreplace"))
6395 known_errorHandler = 4;
6396 else
6397 known_errorHandler = 0;
6398 }
6399 switch (known_errorHandler) {
6400 case 1: /* strict */
6401 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6402 goto onError;
6403 case 2: /* replace */
6404 for (p = collstart; p < collend; ++p)
6405 *output++ = '?';
6406 /* fall through */
6407 case 3: /* ignore */
6408 p = collend;
6409 break;
6410 case 4: /* xmlcharrefreplace */
6411 /* generate replacement (temporarily (mis)uses p) */
6412 for (p = collstart; p < collend; ++p)
6413 output += sprintf(output, "&#%d;", (int)*p);
6414 p = collend;
6415 break;
6416 default:
6417 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6418 encoding, reason, s, length, &exc,
6419 collstart-s, collend-s, &newpos);
6420 if (repunicode == NULL)
6421 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006422 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006423 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006424 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6425 Py_DECREF(repunicode);
6426 goto onError;
6427 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 /* generate replacement */
6429 repsize = PyUnicode_GET_SIZE(repunicode);
6430 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6431 Py_UNICODE ch = *uni2;
6432 if (Py_UNICODE_ISSPACE(ch))
6433 *output++ = ' ';
6434 else {
6435 decimal = Py_UNICODE_TODECIMAL(ch);
6436 if (decimal >= 0)
6437 *output++ = '0' + decimal;
6438 else if (0 < ch && ch < 256)
6439 *output++ = (char)ch;
6440 else {
6441 Py_DECREF(repunicode);
6442 raise_encode_exception(&exc, encoding,
6443 s, length, collstart-s, collend-s, reason);
6444 goto onError;
6445 }
6446 }
6447 }
6448 p = s + newpos;
6449 Py_DECREF(repunicode);
6450 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006451 }
6452 /* 0-terminate the output string */
6453 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454 Py_XDECREF(exc);
6455 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006456 return 0;
6457
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 Py_XDECREF(exc);
6460 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006461 return -1;
6462}
6463
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464/* --- Helpers ------------------------------------------------------------ */
6465
Eric Smith8c663262007-08-25 02:26:07 +00006466#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006467#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006468
Thomas Wouters477c8d52006-05-27 19:21:47 +00006469#include "stringlib/count.h"
6470#include "stringlib/find.h"
6471#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006472#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006473
Eric Smith5807c412008-05-11 21:00:57 +00006474#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006475#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006476#include "stringlib/localeutil.h"
6477
/* helper macro to fixup start/end slice values:
   end is clipped to len; a negative end or start has len added once and is
   then clamped at 0.  start and end must be modifiable lvalues and, like
   len, are evaluated more than once.
   The body is wrapped in do { } while (0) so that the expansion is a single
   statement (safe inside an unbraced if/else); write a ';' after the call. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006492
Alexander Belopolsky40018472011-02-26 01:02:56 +00006493Py_ssize_t
6494PyUnicode_Count(PyObject *str,
6495 PyObject *substr,
6496 Py_ssize_t start,
6497 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006499 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006500 PyUnicodeObject* str_obj;
6501 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006502
Thomas Wouters477c8d52006-05-27 19:21:47 +00006503 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6504 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006506 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6507 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 Py_DECREF(str_obj);
6509 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 }
Tim Petersced69f82003-09-16 20:30:58 +00006511
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006512 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006513 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006514 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6515 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516 );
6517
6518 Py_DECREF(sub_obj);
6519 Py_DECREF(str_obj);
6520
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 return result;
6522}
6523
Alexander Belopolsky40018472011-02-26 01:02:56 +00006524Py_ssize_t
6525PyUnicode_Find(PyObject *str,
6526 PyObject *sub,
6527 Py_ssize_t start,
6528 Py_ssize_t end,
6529 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006531 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006532
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006534 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006536 sub = PyUnicode_FromObject(sub);
6537 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 Py_DECREF(str);
6539 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 }
Tim Petersced69f82003-09-16 20:30:58 +00006541
Thomas Wouters477c8d52006-05-27 19:21:47 +00006542 if (direction > 0)
6543 result = stringlib_find_slice(
6544 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6545 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6546 start, end
6547 );
6548 else
6549 result = stringlib_rfind_slice(
6550 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6551 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6552 start, end
6553 );
6554
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006556 Py_DECREF(sub);
6557
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 return result;
6559}
6560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561static int
6562tailmatch(PyUnicodeObject *self,
6563 PyUnicodeObject *substring,
6564 Py_ssize_t start,
6565 Py_ssize_t end,
6566 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006568 if (substring->length == 0)
6569 return 1;
6570
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006571 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 end -= substring->length;
6573 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575
6576 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 if (Py_UNICODE_MATCH(self, end, substring))
6578 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 } else {
6580 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 }
6583
6584 return 0;
6585}
6586
Alexander Belopolsky40018472011-02-26 01:02:56 +00006587Py_ssize_t
6588PyUnicode_Tailmatch(PyObject *str,
6589 PyObject *substr,
6590 Py_ssize_t start,
6591 Py_ssize_t end,
6592 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006594 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006595
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 str = PyUnicode_FromObject(str);
6597 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599 substr = PyUnicode_FromObject(substr);
6600 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 Py_DECREF(str);
6602 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 }
Tim Petersced69f82003-09-16 20:30:58 +00006604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 (PyUnicodeObject *)substr,
6607 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 Py_DECREF(str);
6609 Py_DECREF(substr);
6610 return result;
6611}
6612
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613/* Apply fixfct filter to the Unicode object self and return a
6614 reference to the modified object */
6615
Alexander Belopolsky40018472011-02-26 01:02:56 +00006616static PyObject *
6617fixup(PyUnicodeObject *self,
6618 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619{
6620
6621 PyUnicodeObject *u;
6622
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006623 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006626
6627 Py_UNICODE_COPY(u->str, self->str, self->length);
6628
Tim Peters7a29bd52001-09-12 03:03:31 +00006629 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 /* fixfct should return TRUE if it modified the buffer. If
6631 FALSE, return a reference to the original buffer instead
6632 (to save space, not time) */
6633 Py_INCREF(self);
6634 Py_DECREF(u);
6635 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636 }
6637 return (PyObject*) u;
6638}
6639
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640static int
6641fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006643 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644 Py_UNICODE *s = self->str;
6645 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006646
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006648 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006649
Benjamin Peterson29060642009-01-31 22:14:21 +00006650 ch = Py_UNICODE_TOUPPER(*s);
6651 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006653 *s = ch;
6654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655 s++;
6656 }
6657
6658 return status;
6659}
6660
Alexander Belopolsky40018472011-02-26 01:02:56 +00006661static int
6662fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006664 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006665 Py_UNICODE *s = self->str;
6666 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006667
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006670
Benjamin Peterson29060642009-01-31 22:14:21 +00006671 ch = Py_UNICODE_TOLOWER(*s);
6672 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 *s = ch;
6675 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 s++;
6677 }
6678
6679 return status;
6680}
6681
Alexander Belopolsky40018472011-02-26 01:02:56 +00006682static int
6683fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006685 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 Py_UNICODE *s = self->str;
6687 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006688
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 while (len-- > 0) {
6690 if (Py_UNICODE_ISUPPER(*s)) {
6691 *s = Py_UNICODE_TOLOWER(*s);
6692 status = 1;
6693 } else if (Py_UNICODE_ISLOWER(*s)) {
6694 *s = Py_UNICODE_TOUPPER(*s);
6695 status = 1;
6696 }
6697 s++;
6698 }
6699
6700 return status;
6701}
6702
Alexander Belopolsky40018472011-02-26 01:02:56 +00006703static int
6704fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006705{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006706 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006707 Py_UNICODE *s = self->str;
6708 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006709
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006710 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006711 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006712 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 *s = Py_UNICODE_TOUPPER(*s);
6714 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006716 s++;
6717 while (--len > 0) {
6718 if (Py_UNICODE_ISUPPER(*s)) {
6719 *s = Py_UNICODE_TOLOWER(*s);
6720 status = 1;
6721 }
6722 s++;
6723 }
6724 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725}
6726
Alexander Belopolsky40018472011-02-26 01:02:56 +00006727static int
6728fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
6730 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6731 register Py_UNICODE *e;
6732 int previous_is_cased;
6733
6734 /* Shortcut for single character strings */
6735 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6737 if (*p != ch) {
6738 *p = ch;
6739 return 1;
6740 }
6741 else
6742 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 }
Tim Petersced69f82003-09-16 20:30:58 +00006744
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 e = p + PyUnicode_GET_SIZE(self);
6746 previous_is_cased = 0;
6747 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006749
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 if (previous_is_cased)
6751 *p = Py_UNICODE_TOLOWER(ch);
6752 else
6753 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006754
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 if (Py_UNICODE_ISLOWER(ch) ||
6756 Py_UNICODE_ISUPPER(ch) ||
6757 Py_UNICODE_ISTITLE(ch))
6758 previous_is_cased = 1;
6759 else
6760 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 }
6762 return 1;
6763}
6764
Tim Peters8ce9f162004-08-27 01:49:32 +00006765PyObject *
6766PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767{
Skip Montanaro6543b452004-09-16 03:28:13 +00006768 const Py_UNICODE blank = ' ';
6769 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006770 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006771 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006772 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6773 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006774 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6775 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006776 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006777 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778
Tim Peters05eba1f2004-08-27 21:32:02 +00006779 fseq = PySequence_Fast(seq, "");
6780 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006781 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006782 }
6783
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006784 /* NOTE: the following code can't call back into Python code,
6785 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006786 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006787
Tim Peters05eba1f2004-08-27 21:32:02 +00006788 seqlen = PySequence_Fast_GET_SIZE(fseq);
6789 /* If empty sequence, return u"". */
6790 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006791 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6792 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006793 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006794 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006795 /* If singleton sequence with an exact Unicode, return that. */
6796 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 item = items[0];
6798 if (PyUnicode_CheckExact(item)) {
6799 Py_INCREF(item);
6800 res = (PyUnicodeObject *)item;
6801 goto Done;
6802 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006803 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006804 else {
6805 /* Set up sep and seplen */
6806 if (separator == NULL) {
6807 sep = &blank;
6808 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006809 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006810 else {
6811 if (!PyUnicode_Check(separator)) {
6812 PyErr_Format(PyExc_TypeError,
6813 "separator: expected str instance,"
6814 " %.80s found",
6815 Py_TYPE(separator)->tp_name);
6816 goto onError;
6817 }
6818 sep = PyUnicode_AS_UNICODE(separator);
6819 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006820 }
6821 }
6822
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006823 /* There are at least two things to join, or else we have a subclass
6824 * of str in the sequence.
6825 * Do a pre-pass to figure out the total amount of space we'll
6826 * need (sz), and see whether all argument are strings.
6827 */
6828 sz = 0;
6829 for (i = 0; i < seqlen; i++) {
6830 const Py_ssize_t old_sz = sz;
6831 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 if (!PyUnicode_Check(item)) {
6833 PyErr_Format(PyExc_TypeError,
6834 "sequence item %zd: expected str instance,"
6835 " %.80s found",
6836 i, Py_TYPE(item)->tp_name);
6837 goto onError;
6838 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006839 sz += PyUnicode_GET_SIZE(item);
6840 if (i != 0)
6841 sz += seplen;
6842 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6843 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006845 goto onError;
6846 }
6847 }
Tim Petersced69f82003-09-16 20:30:58 +00006848
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006849 res = _PyUnicode_New(sz);
6850 if (res == NULL)
6851 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006852
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006853 /* Catenate everything. */
6854 res_p = PyUnicode_AS_UNICODE(res);
6855 for (i = 0; i < seqlen; ++i) {
6856 Py_ssize_t itemlen;
6857 item = items[i];
6858 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 /* Copy item, and maybe the separator. */
6860 if (i) {
6861 Py_UNICODE_COPY(res_p, sep, seplen);
6862 res_p += seplen;
6863 }
6864 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6865 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006866 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006867
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006869 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 return (PyObject *)res;
6871
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006873 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006874 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006875 return NULL;
6876}
6877
Alexander Belopolsky40018472011-02-26 01:02:56 +00006878static PyUnicodeObject *
6879pad(PyUnicodeObject *self,
6880 Py_ssize_t left,
6881 Py_ssize_t right,
6882 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883{
6884 PyUnicodeObject *u;
6885
6886 if (left < 0)
6887 left = 0;
6888 if (right < 0)
6889 right = 0;
6890
Tim Peters7a29bd52001-09-12 03:03:31 +00006891 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892 Py_INCREF(self);
6893 return self;
6894 }
6895
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006896 if (left > PY_SSIZE_T_MAX - self->length ||
6897 right > PY_SSIZE_T_MAX - (left + self->length)) {
6898 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6899 return NULL;
6900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 u = _PyUnicode_New(left + self->length + right);
6902 if (u) {
6903 if (left)
6904 Py_UNICODE_FILL(u->str, fill, left);
6905 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6906 if (right)
6907 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6908 }
6909
6910 return u;
6911}
6912
Alexander Belopolsky40018472011-02-26 01:02:56 +00006913PyObject *
6914PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006915{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006917
6918 string = PyUnicode_FromObject(string);
6919 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006922 list = stringlib_splitlines(
6923 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6924 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
6926 Py_DECREF(string);
6927 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006928}
6929
Alexander Belopolsky40018472011-02-26 01:02:56 +00006930static PyObject *
6931split(PyUnicodeObject *self,
6932 PyUnicodeObject *substring,
6933 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006936 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006939 return stringlib_split_whitespace(
6940 (PyObject*) self, self->str, self->length, maxcount
6941 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006943 return stringlib_split(
6944 (PyObject*) self, self->str, self->length,
6945 substring->str, substring->length,
6946 maxcount
6947 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006948}
6949
Alexander Belopolsky40018472011-02-26 01:02:56 +00006950static PyObject *
6951rsplit(PyUnicodeObject *self,
6952 PyUnicodeObject *substring,
6953 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006954{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006955 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006956 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006957
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006958 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006959 return stringlib_rsplit_whitespace(
6960 (PyObject*) self, self->str, self->length, maxcount
6961 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006962
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006963 return stringlib_rsplit(
6964 (PyObject*) self, self->str, self->length,
6965 substring->str, substring->length,
6966 maxcount
6967 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006968}
6969
Alexander Belopolsky40018472011-02-26 01:02:56 +00006970static PyObject *
6971replace(PyUnicodeObject *self,
6972 PyUnicodeObject *str1,
6973 PyUnicodeObject *str2,
6974 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006975{
6976 PyUnicodeObject *u;
6977
6978 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006979 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006980 else if (maxcount == 0 || self->length == 0)
6981 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982
Thomas Wouters477c8d52006-05-27 19:21:47 +00006983 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006984 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006985 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006986 if (str1->length == 0)
6987 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006988 if (str1->length == 1) {
6989 /* replace characters */
6990 Py_UNICODE u1, u2;
6991 if (!findchar(self->str, self->length, str1->str[0]))
6992 goto nothing;
6993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6994 if (!u)
6995 return NULL;
6996 Py_UNICODE_COPY(u->str, self->str, self->length);
6997 u1 = str1->str[0];
6998 u2 = str2->str[0];
6999 for (i = 0; i < u->length; i++)
7000 if (u->str[i] == u1) {
7001 if (--maxcount < 0)
7002 break;
7003 u->str[i] = u2;
7004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007005 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007006 i = stringlib_find(
7007 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009 if (i < 0)
7010 goto nothing;
7011 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7012 if (!u)
7013 return NULL;
7014 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007015
7016 /* change everything in-place, starting with this one */
7017 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7018 i += str1->length;
7019
7020 while ( --maxcount > 0) {
7021 i = stringlib_find(self->str+i, self->length-i,
7022 str1->str, str1->length,
7023 i);
7024 if (i == -1)
7025 break;
7026 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7027 i += str1->length;
7028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007030 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007031
Brett Cannonb94767f2011-02-22 20:15:44 +00007032 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007033 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 Py_UNICODE *p;
7035
7036 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007037 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7038 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007039 if (n == 0)
7040 goto nothing;
7041 /* new_size = self->length + n * (str2->length - str1->length)); */
7042 delta = (str2->length - str1->length);
7043 if (delta == 0) {
7044 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007046 product = n * (str2->length - str1->length);
7047 if ((product / (str2->length - str1->length)) != n) {
7048 PyErr_SetString(PyExc_OverflowError,
7049 "replace string is too long");
7050 return NULL;
7051 }
7052 new_size = self->length + product;
7053 if (new_size < 0) {
7054 PyErr_SetString(PyExc_OverflowError,
7055 "replace string is too long");
7056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007057 }
7058 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007059 u = _PyUnicode_New(new_size);
7060 if (!u)
7061 return NULL;
7062 i = 0;
7063 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007064 if (str1->length > 0) {
7065 while (n-- > 0) {
7066 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007067 j = stringlib_find(self->str+i, self->length-i,
7068 str1->str, str1->length,
7069 i);
7070 if (j == -1)
7071 break;
7072 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007073 /* copy unchanged part [i:j] */
7074 Py_UNICODE_COPY(p, self->str+i, j-i);
7075 p += j - i;
7076 }
7077 /* copy substitution string */
7078 if (str2->length > 0) {
7079 Py_UNICODE_COPY(p, str2->str, str2->length);
7080 p += str2->length;
7081 }
7082 i = j + str1->length;
7083 }
7084 if (i < self->length)
7085 /* copy tail [i:] */
7086 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7087 } else {
7088 /* interleave */
7089 while (n > 0) {
7090 Py_UNICODE_COPY(p, str2->str, str2->length);
7091 p += str2->length;
7092 if (--n <= 0)
7093 break;
7094 *p++ = self->str[i++];
7095 }
7096 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007100
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007102 /* nothing to replace; return original string (when possible) */
7103 if (PyUnicode_CheckExact(self)) {
7104 Py_INCREF(self);
7105 return (PyObject *) self;
7106 }
7107 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108}
7109
7110/* --- Unicode Object Methods --------------------------------------------- */
7111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007112PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007114\n\
7115Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007116characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117
7118static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007119unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007120{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121 return fixup(self, fixtitle);
7122}
7123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007124PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126\n\
7127Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007128have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129
7130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007131unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133 return fixup(self, fixcapitalize);
7134}
7135
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).  Splits self on
   whitespace, replaces every word by fixup(word, fixcapitalize), and joins
   the words with PyUnicode_Join(NULL, ...), i.e. with single spaces.
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        /* Release the old word before PyList_SET_ITEM overwrites the slot. */
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* Reached on success as well; result is NULL after a failed fixup(). */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
7173
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007174/* Argument converter. Coerces to a single unicode character */
7175
7176static int
7177convert_uc(PyObject *obj, void *addr)
7178{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007179 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7180 PyObject *uniobj;
7181 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007182
Benjamin Peterson14339b62009-01-31 16:36:08 +00007183 uniobj = PyUnicode_FromObject(obj);
7184 if (uniobj == NULL) {
7185 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007187 return 0;
7188 }
7189 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7190 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007192 Py_DECREF(uniobj);
7193 return 0;
7194 }
7195 unistr = PyUnicode_AS_UNICODE(uniobj);
7196 *fillcharloc = unistr[0];
7197 Py_DECREF(uniobj);
7198 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007199}
7200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007201PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007203\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007204Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007205done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206
7207static PyObject *
7208unicode_center(PyUnicodeObject *self, PyObject *args)
7209{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007210 Py_ssize_t marg, left;
7211 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007212 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213
Thomas Woutersde017742006-02-16 19:34:37 +00007214 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 return NULL;
7216
Tim Peters7a29bd52001-09-12 03:03:31 +00007217 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218 Py_INCREF(self);
7219 return (PyObject*) self;
7220 }
7221
7222 marg = width - self->length;
7223 left = marg / 2 + (marg & width & 1);
7224
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007225 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226}
7227
Marc-André Lemburge5034372000-08-08 08:04:29 +00007228#if 0
7229
7230/* This code should go into some future Unicode collation support
7231 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007232 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007233
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007234/* speedy UTF-16 code point order comparison */
7235/* gleaned from: */
7236/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7237
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007238static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007239{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007240 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007241 0, 0, 0, 0, 0, 0, 0, 0,
7242 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007243 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007244};
7245
Guido van Rossumd57fd912000-03-10 22:53:23 +00007246static int
7247unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7248{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007249 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007250
Guido van Rossumd57fd912000-03-10 22:53:23 +00007251 Py_UNICODE *s1 = str1->str;
7252 Py_UNICODE *s2 = str2->str;
7253
7254 len1 = str1->length;
7255 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007256
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007258 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007259
7260 c1 = *s1++;
7261 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007262
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 if (c1 > (1<<11) * 26)
7264 c1 += utf16Fixup[c1>>11];
7265 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007266 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007267 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007268
7269 if (c1 != c2)
7270 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007271
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007272 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 }
7274
7275 return (len1 < len2) ? -1 : (len1 != len2);
7276}
7277
Marc-André Lemburge5034372000-08-08 08:04:29 +00007278#else
7279
7280static int
7281unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7282{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007283 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007284
7285 Py_UNICODE *s1 = str1->str;
7286 Py_UNICODE *s2 = str2->str;
7287
7288 len1 = str1->length;
7289 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007290
Marc-André Lemburge5034372000-08-08 08:04:29 +00007291 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007292 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007293
Fredrik Lundh45714e92001-06-26 16:39:36 +00007294 c1 = *s1++;
7295 c2 = *s2++;
7296
7297 if (c1 != c2)
7298 return (c1 < c2) ? -1 : 1;
7299
Marc-André Lemburge5034372000-08-08 08:04:29 +00007300 len1--; len2--;
7301 }
7302
7303 return (len1 < len2) ? -1 : (len1 != len2);
7304}
7305
7306#endif
7307
Alexander Belopolsky40018472011-02-26 01:02:56 +00007308int
7309PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007311 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7312 return unicode_compare((PyUnicodeObject *)left,
7313 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007314 PyErr_Format(PyExc_TypeError,
7315 "Can't compare %.100s and %.100s",
7316 left->ob_type->tp_name,
7317 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318 return -1;
7319}
7320
Martin v. Löwis5b222132007-06-10 09:51:05 +00007321int
7322PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7323{
7324 int i;
7325 Py_UNICODE *id;
7326 assert(PyUnicode_Check(uni));
7327 id = PyUnicode_AS_UNICODE(uni);
7328 /* Compare Unicode string and source character set string */
7329 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 if (id[i] != str[i])
7331 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007332 /* This check keeps Python strings that end in '\0' from comparing equal
7333 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007334 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007336 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007338 return 0;
7339}
7340
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007341
Benjamin Peterson29060642009-01-31 22:14:21 +00007342#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007343 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007344
Alexander Belopolsky40018472011-02-26 01:02:56 +00007345PyObject *
7346PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007347{
7348 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007349
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007350 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7351 PyObject *v;
7352 if (((PyUnicodeObject *) left)->length !=
7353 ((PyUnicodeObject *) right)->length) {
7354 if (op == Py_EQ) {
7355 Py_INCREF(Py_False);
7356 return Py_False;
7357 }
7358 if (op == Py_NE) {
7359 Py_INCREF(Py_True);
7360 return Py_True;
7361 }
7362 }
7363 if (left == right)
7364 result = 0;
7365 else
7366 result = unicode_compare((PyUnicodeObject *)left,
7367 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007368
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007369 /* Convert the return value to a Boolean */
7370 switch (op) {
7371 case Py_EQ:
7372 v = TEST_COND(result == 0);
7373 break;
7374 case Py_NE:
7375 v = TEST_COND(result != 0);
7376 break;
7377 case Py_LE:
7378 v = TEST_COND(result <= 0);
7379 break;
7380 case Py_GE:
7381 v = TEST_COND(result >= 0);
7382 break;
7383 case Py_LT:
7384 v = TEST_COND(result == -1);
7385 break;
7386 case Py_GT:
7387 v = TEST_COND(result == 1);
7388 break;
7389 default:
7390 PyErr_BadArgument();
7391 return NULL;
7392 }
7393 Py_INCREF(v);
7394 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007395 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007396
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007397 Py_INCREF(Py_NotImplemented);
7398 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007399}
7400
Alexander Belopolsky40018472011-02-26 01:02:56 +00007401int
7402PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007403{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007404 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007405 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007406
7407 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007408 sub = PyUnicode_FromObject(element);
7409 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 PyErr_Format(PyExc_TypeError,
7411 "'in <string>' requires string as left operand, not %s",
7412 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007413 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007414 }
7415
Thomas Wouters477c8d52006-05-27 19:21:47 +00007416 str = PyUnicode_FromObject(container);
7417 if (!str) {
7418 Py_DECREF(sub);
7419 return -1;
7420 }
7421
7422 result = stringlib_contains_obj(str, sub);
7423
7424 Py_DECREF(str);
7425 Py_DECREF(sub);
7426
Guido van Rossum403d68b2000-03-13 15:55:09 +00007427 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007428}
7429
Guido van Rossumd57fd912000-03-10 22:53:23 +00007430/* Concat to string or Unicode object giving a new Unicode object. */
7431
Alexander Belopolsky40018472011-02-26 01:02:56 +00007432PyObject *
7433PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434{
7435 PyUnicodeObject *u = NULL, *v = NULL, *w;
7436
7437 /* Coerce the two arguments */
7438 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7439 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007441 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7442 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007443 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007444
7445 /* Shortcuts */
7446 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 Py_DECREF(v);
7448 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 }
7450 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 Py_DECREF(u);
7452 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 }
7454
7455 /* Concat the two Unicode strings */
7456 w = _PyUnicode_New(u->length + v->length);
7457 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 Py_UNICODE_COPY(w->str, u->str, u->length);
7460 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7461
7462 Py_DECREF(u);
7463 Py_DECREF(v);
7464 return (PyObject *)w;
7465
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 Py_XDECREF(u);
7468 Py_XDECREF(v);
7469 return NULL;
7470}
7471
Walter Dörwald1ab83302007-05-18 17:15:44 +00007472void
7473PyUnicode_Append(PyObject **pleft, PyObject *right)
7474{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007475 PyObject *new;
7476 if (*pleft == NULL)
7477 return;
7478 if (right == NULL || !PyUnicode_Check(*pleft)) {
7479 Py_DECREF(*pleft);
7480 *pleft = NULL;
7481 return;
7482 }
7483 new = PyUnicode_Concat(*pleft, right);
7484 Py_DECREF(*pleft);
7485 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007486}
7487
7488void
7489PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7490{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007491 PyUnicode_Append(pleft, right);
7492 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007493}
7494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007495PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007497\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007498Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007499string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007500interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501
7502static PyObject *
7503unicode_count(PyUnicodeObject *self, PyObject *args)
7504{
7505 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007506 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007507 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508 PyObject *result;
7509
Guido van Rossumb8872e62000-05-09 14:14:27 +00007510 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 return NULL;
7513
7514 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007515 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007518
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007519 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007520 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007521 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007522 substring->str, substring->length,
7523 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007524 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007525
7526 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007527
Guido van Rossumd57fd912000-03-10 22:53:23 +00007528 return result;
7529}
7530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007531PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007532 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007534Encode S using the codec registered for encoding. Default encoding\n\
7535is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007536handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007537a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7538'xmlcharrefreplace' as well as any other name registered with\n\
7539codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007540
7541static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007542unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007544 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007545 char *encoding = NULL;
7546 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007547
Benjamin Peterson308d6372009-09-18 21:42:35 +00007548 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7549 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007550 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007551 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007552}
7553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007554PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007556\n\
7557Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007558If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007559
7560static PyObject*
7561unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7562{
7563 Py_UNICODE *e;
7564 Py_UNICODE *p;
7565 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007566 Py_UNICODE *qe;
7567 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 PyUnicodeObject *u;
7569 int tabsize = 8;
7570
7571 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007573
Thomas Wouters7e474022000-07-16 12:04:32 +00007574 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007575 i = 0; /* chars up to and including most recent \n or \r */
7576 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7577 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007578 for (p = self->str; p < e; p++)
7579 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 if (tabsize > 0) {
7581 incr = tabsize - (j % tabsize); /* cannot overflow */
7582 if (j > PY_SSIZE_T_MAX - incr)
7583 goto overflow1;
7584 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007585 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 if (j > PY_SSIZE_T_MAX - 1)
7589 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 j++;
7591 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 if (i > PY_SSIZE_T_MAX - j)
7593 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007595 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 }
7597 }
7598
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007599 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007601
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 /* Second pass: create output string and fill it */
7603 u = _PyUnicode_New(i + j);
7604 if (!u)
7605 return NULL;
7606
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007607 j = 0; /* same as in first pass */
7608 q = u->str; /* next output char */
7609 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610
7611 for (p = self->str; p < e; p++)
7612 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007613 if (tabsize > 0) {
7614 i = tabsize - (j % tabsize);
7615 j += i;
7616 while (i--) {
7617 if (q >= qe)
7618 goto overflow2;
7619 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007620 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007622 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 else {
7624 if (q >= qe)
7625 goto overflow2;
7626 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007627 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628 if (*p == '\n' || *p == '\r')
7629 j = 0;
7630 }
7631
7632 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007633
7634 overflow2:
7635 Py_DECREF(u);
7636 overflow1:
7637 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639}
7640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007641PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643\n\
7644Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007645such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646arguments start and end are interpreted as in slice notation.\n\
7647\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007648Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649
7650static PyObject *
7651unicode_find(PyUnicodeObject *self, PyObject *args)
7652{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007653 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007654 Py_ssize_t start;
7655 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007656 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657
Christian Heimes9cd17752007-11-18 19:35:23 +00007658 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007660
Thomas Wouters477c8d52006-05-27 19:21:47 +00007661 result = stringlib_find_slice(
7662 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7663 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7664 start, end
7665 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666
7667 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007668
Christian Heimes217cfd12007-12-02 14:31:20 +00007669 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670}
7671
7672static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007673unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674{
7675 if (index < 0 || index >= self->length) {
7676 PyErr_SetString(PyExc_IndexError, "string index out of range");
7677 return NULL;
7678 }
7679
7680 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7681}
7682
Guido van Rossumc2504932007-09-18 19:42:40 +00007683/* Believe it or not, this produces the same value for ASCII strings
7684 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007685static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007686unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687{
Guido van Rossumc2504932007-09-18 19:42:40 +00007688 Py_ssize_t len;
7689 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007690 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007691
7692 if (self->hash != -1)
7693 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007694 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007695 p = self->str;
7696 x = *p << 7;
7697 while (--len >= 0)
7698 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007699 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007700 if (x == -1)
7701 x = -2;
7702 self->hash = x;
7703 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704}
7705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007706PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007709Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710
7711static PyObject *
7712unicode_index(PyUnicodeObject *self, PyObject *args)
7713{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007714 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007715 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007716 Py_ssize_t start;
7717 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
Christian Heimes9cd17752007-11-18 19:35:23 +00007719 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007721
Thomas Wouters477c8d52006-05-27 19:21:47 +00007722 result = stringlib_find_slice(
7723 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7724 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7725 start, end
7726 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727
7728 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007729
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 if (result < 0) {
7731 PyErr_SetString(PyExc_ValueError, "substring not found");
7732 return NULL;
7733 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007734
Christian Heimes217cfd12007-12-02 14:31:20 +00007735 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736}
7737
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007738PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007741Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007742at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007743
7744static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007745unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746{
7747 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7748 register const Py_UNICODE *e;
7749 int cased;
7750
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751 /* Shortcut for single character strings */
7752 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007755 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007756 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007758
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 e = p + PyUnicode_GET_SIZE(self);
7760 cased = 0;
7761 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007763
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7765 return PyBool_FromLong(0);
7766 else if (!cased && Py_UNICODE_ISLOWER(ch))
7767 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007769 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007770}
7771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007772PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007775Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007776at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007777
7778static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007779unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007780{
7781 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7782 register const Py_UNICODE *e;
7783 int cased;
7784
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785 /* Shortcut for single character strings */
7786 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007789 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007790 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007792
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 e = p + PyUnicode_GET_SIZE(self);
7794 cased = 0;
7795 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007797
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7799 return PyBool_FromLong(0);
7800 else if (!cased && Py_UNICODE_ISUPPER(ch))
7801 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007802 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007803 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007804}
7805
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007806PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007809Return True if S is a titlecased string and there is at least one\n\
7810character in S, i.e. upper- and titlecase characters may only\n\
7811follow uncased characters and lowercase characters only cased ones.\n\
7812Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813
7814static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007815unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816{
7817 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7818 register const Py_UNICODE *e;
7819 int cased, previous_is_cased;
7820
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821 /* Shortcut for single character strings */
7822 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7824 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007826 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007827 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007829
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830 e = p + PyUnicode_GET_SIZE(self);
7831 cased = 0;
7832 previous_is_cased = 0;
7833 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007835
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7837 if (previous_is_cased)
7838 return PyBool_FromLong(0);
7839 previous_is_cased = 1;
7840 cased = 1;
7841 }
7842 else if (Py_UNICODE_ISLOWER(ch)) {
7843 if (!previous_is_cased)
7844 return PyBool_FromLong(0);
7845 previous_is_cased = 1;
7846 cased = 1;
7847 }
7848 else
7849 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007851 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852}
7853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007854PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007855 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007857Return True if all characters in S are whitespace\n\
7858and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007859
7860static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007861unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862{
7863 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7864 register const Py_UNICODE *e;
7865
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866 /* Shortcut for single character strings */
7867 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 Py_UNICODE_ISSPACE(*p))
7869 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007871 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007872 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007874
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875 e = p + PyUnicode_GET_SIZE(self);
7876 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 if (!Py_UNICODE_ISSPACE(*p))
7878 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007880 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007881}
7882
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007883PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007884 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007885\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007886Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007887and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007888
7889static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007890unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007891{
7892 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7893 register const Py_UNICODE *e;
7894
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007895 /* Shortcut for single character strings */
7896 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 Py_UNICODE_ISALPHA(*p))
7898 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007899
7900 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007901 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007903
7904 e = p + PyUnicode_GET_SIZE(self);
7905 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 if (!Py_UNICODE_ISALPHA(*p))
7907 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007908 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007909 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007910}
7911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007912PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007914\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007915Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007916and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007917
7918static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007919unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007920{
7921 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7922 register const Py_UNICODE *e;
7923
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007924 /* Shortcut for single character strings */
7925 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 Py_UNICODE_ISALNUM(*p))
7927 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007928
7929 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007930 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007931 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007932
7933 e = p + PyUnicode_GET_SIZE(self);
7934 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 if (!Py_UNICODE_ISALNUM(*p))
7936 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007937 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007938 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007939}
7940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007941PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007943\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007944Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007945False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946
7947static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007948unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007949{
7950 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7951 register const Py_UNICODE *e;
7952
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953 /* Shortcut for single character strings */
7954 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 Py_UNICODE_ISDECIMAL(*p))
7956 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007958 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007959 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007961
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962 e = p + PyUnicode_GET_SIZE(self);
7963 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 if (!Py_UNICODE_ISDECIMAL(*p))
7965 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007967 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007968}
7969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007970PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007973Return True if all characters in S are digits\n\
7974and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975
7976static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007977unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978{
7979 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7980 register const Py_UNICODE *e;
7981
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 /* Shortcut for single character strings */
7983 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 Py_UNICODE_ISDIGIT(*p))
7985 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007987 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007988 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007990
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 e = p + PyUnicode_GET_SIZE(self);
7992 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 if (!Py_UNICODE_ISDIGIT(*p))
7994 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007996 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997}
7998
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007999PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008002Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008003False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004
8005static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008006unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007{
8008 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8009 register const Py_UNICODE *e;
8010
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 /* Shortcut for single character strings */
8012 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 Py_UNICODE_ISNUMERIC(*p))
8014 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008016 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008017 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008019
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 e = p + PyUnicode_GET_SIZE(self);
8021 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 if (!Py_UNICODE_ISNUMERIC(*p))
8023 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008025 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026}
8027
Martin v. Löwis47383402007-08-15 07:32:56 +00008028int
8029PyUnicode_IsIdentifier(PyObject *self)
8030{
8031 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8032 register const Py_UNICODE *e;
8033
8034 /* Special case for empty strings */
8035 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008037
8038 /* PEP 3131 says that the first character must be in
8039 XID_Start and subsequent characters in XID_Continue,
8040 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008041 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008042 letters, digits, underscore). However, given the current
8043 definition of XID_Start and XID_Continue, it is sufficient
8044 to check just for these, except that _ must be allowed
8045 as starting an identifier. */
8046 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8047 return 0;
8048
8049 e = p + PyUnicode_GET_SIZE(self);
8050 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 if (!_PyUnicode_IsXidContinue(*p))
8052 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008053 }
8054 return 1;
8055}
8056
8057PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008059\n\
8060Return True if S is a valid identifier according\n\
8061to the language definition.");
8062
8063static PyObject*
8064unicode_isidentifier(PyObject *self)
8065{
8066 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8067}
8068
Georg Brandl559e5d72008-06-11 18:37:52 +00008069PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008071\n\
8072Return True if all characters in S are considered\n\
8073printable in repr() or S is empty, False otherwise.");
8074
8075static PyObject*
8076unicode_isprintable(PyObject *self)
8077{
8078 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8079 register const Py_UNICODE *e;
8080
8081 /* Shortcut for single character strings */
8082 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8083 Py_RETURN_TRUE;
8084 }
8085
8086 e = p + PyUnicode_GET_SIZE(self);
8087 for (; p < e; p++) {
8088 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8089 Py_RETURN_FALSE;
8090 }
8091 }
8092 Py_RETURN_TRUE;
8093}
8094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008095PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008096 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008097\n\
8098Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008099iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008100
8101static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008102unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008103{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008104 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105}
8106
Martin v. Löwis18e16552006-02-15 17:27:45 +00008107static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108unicode_length(PyUnicodeObject *self)
8109{
8110 return self->length;
8111}
8112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008113PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008116Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008117done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
8119static PyObject *
8120unicode_ljust(PyUnicodeObject *self, PyObject *args)
8121{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008122 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008123 Py_UNICODE fillchar = ' ';
8124
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008125 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126 return NULL;
8127
Tim Peters7a29bd52001-09-12 03:03:31 +00008128 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129 Py_INCREF(self);
8130 return (PyObject*) self;
8131 }
8132
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008133 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134}
8135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008136PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008139Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140
8141static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008142unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 return fixup(self, fixlower);
8145}
8146
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip type above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8155
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008156/* externally visible for str.strip(unicode) */
8157PyObject *
8158_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8159{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8161 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8162 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8163 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8164 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008165
Benjamin Peterson29060642009-01-31 22:14:21 +00008166 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008167
Benjamin Peterson14339b62009-01-31 16:36:08 +00008168 i = 0;
8169 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8171 i++;
8172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008174
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 j = len;
8176 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 do {
8178 j--;
8179 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8180 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008182
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 Py_INCREF(self);
8185 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 }
8187 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008189}
8190
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191
8192static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008193do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008195 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8196 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008197
Benjamin Peterson14339b62009-01-31 16:36:08 +00008198 i = 0;
8199 if (striptype != RIGHTSTRIP) {
8200 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8201 i++;
8202 }
8203 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008204
Benjamin Peterson14339b62009-01-31 16:36:08 +00008205 j = len;
8206 if (striptype != LEFTSTRIP) {
8207 do {
8208 j--;
8209 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8210 j++;
8211 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008212
Benjamin Peterson14339b62009-01-31 16:36:08 +00008213 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8214 Py_INCREF(self);
8215 return (PyObject*)self;
8216 }
8217 else
8218 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008219}
8220
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008221
8222static PyObject *
8223do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8224{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008225 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008226
Benjamin Peterson14339b62009-01-31 16:36:08 +00008227 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8228 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008229
Benjamin Peterson14339b62009-01-31 16:36:08 +00008230 if (sep != NULL && sep != Py_None) {
8231 if (PyUnicode_Check(sep))
8232 return _PyUnicode_XStrip(self, striptype, sep);
8233 else {
8234 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 "%s arg must be None or str",
8236 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008237 return NULL;
8238 }
8239 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008240
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008242}
8243
8244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008245PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008247\n\
8248Return a copy of the string S with leading and trailing\n\
8249whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008250If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008251
8252static PyObject *
8253unicode_strip(PyUnicodeObject *self, PyObject *args)
8254{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008255 if (PyTuple_GET_SIZE(args) == 0)
8256 return do_strip(self, BOTHSTRIP); /* Common case */
8257 else
8258 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008259}
8260
8261
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008262PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008264\n\
8265Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008266If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008267
8268static PyObject *
8269unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8270{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008271 if (PyTuple_GET_SIZE(args) == 0)
8272 return do_strip(self, LEFTSTRIP); /* Common case */
8273 else
8274 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008275}
8276
8277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008278PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008280\n\
8281Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008282If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008283
8284static PyObject *
8285unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8286{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008287 if (PyTuple_GET_SIZE(args) == 0)
8288 return do_strip(self, RIGHTSTRIP); /* Common case */
8289 else
8290 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008291}
8292
8293
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008295unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296{
8297 PyUnicodeObject *u;
8298 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008299 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008300 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
Georg Brandl222de0f2009-04-12 12:01:50 +00008302 if (len < 1) {
8303 Py_INCREF(unicode_empty);
8304 return (PyObject *)unicode_empty;
8305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306
Tim Peters7a29bd52001-09-12 03:03:31 +00008307 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 /* no repeat, return original string */
8309 Py_INCREF(str);
8310 return (PyObject*) str;
8311 }
Tim Peters8f422462000-09-09 06:13:41 +00008312
8313 /* ensure # of chars needed doesn't overflow int and # of bytes
8314 * needed doesn't overflow size_t
8315 */
8316 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008317 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008318 PyErr_SetString(PyExc_OverflowError,
8319 "repeated string is too long");
8320 return NULL;
8321 }
8322 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8323 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8324 PyErr_SetString(PyExc_OverflowError,
8325 "repeated string is too long");
8326 return NULL;
8327 }
8328 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 if (!u)
8330 return NULL;
8331
8332 p = u->str;
8333
Georg Brandl222de0f2009-04-12 12:01:50 +00008334 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008335 Py_UNICODE_FILL(p, str->str[0], len);
8336 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008337 Py_ssize_t done = str->length; /* number of characters copied this far */
8338 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008340 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008341 Py_UNICODE_COPY(p+done, p, n);
8342 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344 }
8345
8346 return (PyObject*) u;
8347}
8348
Alexander Belopolsky40018472011-02-26 01:02:56 +00008349PyObject *
8350PyUnicode_Replace(PyObject *obj,
8351 PyObject *subobj,
8352 PyObject *replobj,
8353 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354{
8355 PyObject *self;
8356 PyObject *str1;
8357 PyObject *str2;
8358 PyObject *result;
8359
8360 self = PyUnicode_FromObject(obj);
8361 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 str1 = PyUnicode_FromObject(subobj);
8364 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 Py_DECREF(self);
8366 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 }
8368 str2 = PyUnicode_FromObject(replobj);
8369 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 Py_DECREF(self);
8371 Py_DECREF(str1);
8372 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008373 }
Tim Petersced69f82003-09-16 20:30:58 +00008374 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 (PyUnicodeObject *)str1,
8376 (PyUnicodeObject *)str2,
8377 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 Py_DECREF(self);
8379 Py_DECREF(str1);
8380 Py_DECREF(str2);
8381 return result;
8382}
8383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008384PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008385 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386\n\
8387Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008388old replaced by new. If the optional argument count is\n\
8389given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390
8391static PyObject*
8392unicode_replace(PyUnicodeObject *self, PyObject *args)
8393{
8394 PyUnicodeObject *str1;
8395 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008396 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397 PyObject *result;
8398
Martin v. Löwis18e16552006-02-15 17:27:45 +00008399 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400 return NULL;
8401 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8402 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008405 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 Py_DECREF(str1);
8407 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409
8410 result = replace(self, str1, str2, maxcount);
8411
8412 Py_DECREF(str1);
8413 Py_DECREF(str2);
8414 return result;
8415}
8416
Alexander Belopolsky40018472011-02-26 01:02:56 +00008417static PyObject *
8418unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008420 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008421 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008422 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8423 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8424
8425 /* XXX(nnorwitz): rather than over-allocating, it would be
8426 better to choose a different scheme. Perhaps scan the
8427 first N-chars of the string and allocate based on that size.
8428 */
8429 /* Initial allocation is based on the longest-possible unichr
8430 escape.
8431
8432 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8433 unichr, so in this case it's the longest unichr escape. In
8434 narrow (UTF-16) builds this is five chars per source unichr
8435 since there are two unichrs in the surrogate pair, so in narrow
8436 (UTF-16) builds it's not the longest unichr escape.
8437
8438 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8439 so in the narrow (UTF-16) build case it's the longest unichr
8440 escape.
8441 */
8442
Walter Dörwald1ab83302007-05-18 17:15:44 +00008443 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008445#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008447#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008449#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008451 if (repr == NULL)
8452 return NULL;
8453
Walter Dörwald1ab83302007-05-18 17:15:44 +00008454 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008455
8456 /* Add quote */
8457 *p++ = (findchar(s, size, '\'') &&
8458 !findchar(s, size, '"')) ? '"' : '\'';
8459 while (size-- > 0) {
8460 Py_UNICODE ch = *s++;
8461
8462 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008463 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008464 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008465 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008466 continue;
8467 }
8468
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008470 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008471 *p++ = '\\';
8472 *p++ = 't';
8473 }
8474 else if (ch == '\n') {
8475 *p++ = '\\';
8476 *p++ = 'n';
8477 }
8478 else if (ch == '\r') {
8479 *p++ = '\\';
8480 *p++ = 'r';
8481 }
8482
8483 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008484 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008485 *p++ = '\\';
8486 *p++ = 'x';
8487 *p++ = hexdigits[(ch >> 4) & 0x000F];
8488 *p++ = hexdigits[ch & 0x000F];
8489 }
8490
Georg Brandl559e5d72008-06-11 18:37:52 +00008491 /* Copy ASCII characters as-is */
8492 else if (ch < 0x7F) {
8493 *p++ = ch;
8494 }
8495
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008497 else {
8498 Py_UCS4 ucs = ch;
8499
8500#ifndef Py_UNICODE_WIDE
8501 Py_UNICODE ch2 = 0;
8502 /* Get code point from surrogate pair */
8503 if (size > 0) {
8504 ch2 = *s;
8505 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008507 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008509 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008510 size--;
8511 }
8512 }
8513#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008514 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008515 (categories Z* and C* except ASCII space)
8516 */
8517 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8518 /* Map 8-bit characters to '\xhh' */
8519 if (ucs <= 0xff) {
8520 *p++ = '\\';
8521 *p++ = 'x';
8522 *p++ = hexdigits[(ch >> 4) & 0x000F];
8523 *p++ = hexdigits[ch & 0x000F];
8524 }
8525 /* Map 21-bit characters to '\U00xxxxxx' */
8526 else if (ucs >= 0x10000) {
8527 *p++ = '\\';
8528 *p++ = 'U';
8529 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8530 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8531 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8532 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8533 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8534 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8535 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8536 *p++ = hexdigits[ucs & 0x0000000F];
8537 }
8538 /* Map 16-bit characters to '\uxxxx' */
8539 else {
8540 *p++ = '\\';
8541 *p++ = 'u';
8542 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8543 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8544 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8545 *p++ = hexdigits[ucs & 0x000F];
8546 }
8547 }
8548 /* Copy characters as-is */
8549 else {
8550 *p++ = ch;
8551#ifndef Py_UNICODE_WIDE
8552 if (ucs >= 0x10000)
8553 *p++ = ch2;
8554#endif
8555 }
8556 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008557 }
8558 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008559 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008560
8561 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008562 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008563 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564}
8565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008566PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568\n\
8569Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008570such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571arguments start and end are interpreted as in slice notation.\n\
8572\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008573Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574
8575static PyObject *
8576unicode_rfind(PyUnicodeObject *self, PyObject *args)
8577{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008578 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008579 Py_ssize_t start;
8580 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008581 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582
Christian Heimes9cd17752007-11-18 19:35:23 +00008583 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585
Thomas Wouters477c8d52006-05-27 19:21:47 +00008586 result = stringlib_rfind_slice(
8587 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8588 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8589 start, end
8590 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591
8592 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008593
Christian Heimes217cfd12007-12-02 14:31:20 +00008594 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595}
8596
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008597PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008598 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008600Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601
8602static PyObject *
8603unicode_rindex(PyUnicodeObject *self, PyObject *args)
8604{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008605 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008606 Py_ssize_t start;
8607 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008608 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
Christian Heimes9cd17752007-11-18 19:35:23 +00008610 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008611 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612
Thomas Wouters477c8d52006-05-27 19:21:47 +00008613 result = stringlib_rfind_slice(
8614 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8615 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8616 start, end
8617 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618
8619 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008620
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 if (result < 0) {
8622 PyErr_SetString(PyExc_ValueError, "substring not found");
8623 return NULL;
8624 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008625 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626}
8627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008628PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008631Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008632done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633
8634static PyObject *
8635unicode_rjust(PyUnicodeObject *self, PyObject *args)
8636{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008637 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008638 Py_UNICODE fillchar = ' ';
8639
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008640 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641 return NULL;
8642
Tim Peters7a29bd52001-09-12 03:03:31 +00008643 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644 Py_INCREF(self);
8645 return (PyObject*) self;
8646 }
8647
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008648 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649}
8650
Alexander Belopolsky40018472011-02-26 01:02:56 +00008651PyObject *
8652PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653{
8654 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008655
Guido van Rossumd57fd912000-03-10 22:53:23 +00008656 s = PyUnicode_FromObject(s);
8657 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008658 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 if (sep != NULL) {
8660 sep = PyUnicode_FromObject(sep);
8661 if (sep == NULL) {
8662 Py_DECREF(s);
8663 return NULL;
8664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 }
8666
8667 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8668
8669 Py_DECREF(s);
8670 Py_XDECREF(sep);
8671 return result;
8672}
8673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008674PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676\n\
8677Return a list of the words in S, using sep as the\n\
8678delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008679splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008680whitespace string is a separator and empty strings are\n\
8681removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682
8683static PyObject*
8684unicode_split(PyUnicodeObject *self, PyObject *args)
8685{
8686 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008687 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008688
Martin v. Löwis18e16552006-02-15 17:27:45 +00008689 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 return NULL;
8691
8692 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698}
8699
Thomas Wouters477c8d52006-05-27 19:21:47 +00008700PyObject *
8701PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8702{
8703 PyObject* str_obj;
8704 PyObject* sep_obj;
8705 PyObject* out;
8706
8707 str_obj = PyUnicode_FromObject(str_in);
8708 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008710 sep_obj = PyUnicode_FromObject(sep_in);
8711 if (!sep_obj) {
8712 Py_DECREF(str_obj);
8713 return NULL;
8714 }
8715
8716 out = stringlib_partition(
8717 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8718 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8719 );
8720
8721 Py_DECREF(sep_obj);
8722 Py_DECREF(str_obj);
8723
8724 return out;
8725}
8726
8727
8728PyObject *
8729PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8730{
8731 PyObject* str_obj;
8732 PyObject* sep_obj;
8733 PyObject* out;
8734
8735 str_obj = PyUnicode_FromObject(str_in);
8736 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008738 sep_obj = PyUnicode_FromObject(sep_in);
8739 if (!sep_obj) {
8740 Py_DECREF(str_obj);
8741 return NULL;
8742 }
8743
8744 out = stringlib_rpartition(
8745 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8746 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8747 );
8748
8749 Py_DECREF(sep_obj);
8750 Py_DECREF(str_obj);
8751
8752 return out;
8753}
8754
8755PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008757\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008758Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008759the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008760found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008761
8762static PyObject*
8763unicode_partition(PyUnicodeObject *self, PyObject *separator)
8764{
8765 return PyUnicode_Partition((PyObject *)self, separator);
8766}
8767
8768PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008769 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008770\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008771Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008772the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008773separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008774
8775static PyObject*
8776unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8777{
8778 return PyUnicode_RPartition((PyObject *)self, separator);
8779}
8780
Alexander Belopolsky40018472011-02-26 01:02:56 +00008781PyObject *
8782PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008783{
8784 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008785
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008786 s = PyUnicode_FromObject(s);
8787 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008788 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 if (sep != NULL) {
8790 sep = PyUnicode_FromObject(sep);
8791 if (sep == NULL) {
8792 Py_DECREF(s);
8793 return NULL;
8794 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008795 }
8796
8797 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8798
8799 Py_DECREF(s);
8800 Py_XDECREF(sep);
8801 return result;
8802}
8803
8804PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008806\n\
8807Return a list of the words in S, using sep as the\n\
8808delimiter string, starting at the end of the string and\n\
8809working to the front. If maxsplit is given, at most maxsplit\n\
8810splits are done. If sep is not specified, any whitespace string\n\
8811is a separator.");
8812
8813static PyObject*
8814unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8815{
8816 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008817 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008818
Martin v. Löwis18e16552006-02-15 17:27:45 +00008819 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008820 return NULL;
8821
8822 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008824 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008826 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008828}
8829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008830PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008831 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832\n\
8833Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008834Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008835is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836
8837static PyObject*
8838unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8839{
Guido van Rossum86662912000-04-11 15:38:46 +00008840 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008841
Guido van Rossum86662912000-04-11 15:38:46 +00008842 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008843 return NULL;
8844
Guido van Rossum86662912000-04-11 15:38:46 +00008845 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008846}
8847
8848static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008849PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850{
Walter Dörwald346737f2007-05-31 10:44:43 +00008851 if (PyUnicode_CheckExact(self)) {
8852 Py_INCREF(self);
8853 return self;
8854 } else
8855 /* Subtype -- return genuine unicode string with the same value. */
8856 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8857 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858}
8859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008860PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862\n\
8863Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008864and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865
8866static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008867unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 return fixup(self, fixswapcase);
8870}
8871
Georg Brandlceee0772007-11-27 23:48:05 +00008872PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008874\n\
8875Return a translation table usable for str.translate().\n\
8876If there is only one argument, it must be a dictionary mapping Unicode\n\
8877ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008878Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008879If there are two arguments, they must be strings of equal length, and\n\
8880in the resulting dictionary, each character in x will be mapped to the\n\
8881character at the same position in y. If there is a third argument, it\n\
8882must be a string, whose characters will be mapped to None in the result.");
8883
8884static PyObject*
8885unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8886{
8887 PyObject *x, *y = NULL, *z = NULL;
8888 PyObject *new = NULL, *key, *value;
8889 Py_ssize_t i = 0;
8890 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008891
Georg Brandlceee0772007-11-27 23:48:05 +00008892 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8893 return NULL;
8894 new = PyDict_New();
8895 if (!new)
8896 return NULL;
8897 if (y != NULL) {
8898 /* x must be a string too, of equal length */
8899 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8900 if (!PyUnicode_Check(x)) {
8901 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8902 "be a string if there is a second argument");
8903 goto err;
8904 }
8905 if (PyUnicode_GET_SIZE(x) != ylen) {
8906 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8907 "arguments must have equal length");
8908 goto err;
8909 }
8910 /* create entries for translating chars in x to those in y */
8911 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008912 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8913 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008914 if (!key || !value)
8915 goto err;
8916 res = PyDict_SetItem(new, key, value);
8917 Py_DECREF(key);
8918 Py_DECREF(value);
8919 if (res < 0)
8920 goto err;
8921 }
8922 /* create entries for deleting chars in z */
8923 if (z != NULL) {
8924 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008925 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008926 if (!key)
8927 goto err;
8928 res = PyDict_SetItem(new, key, Py_None);
8929 Py_DECREF(key);
8930 if (res < 0)
8931 goto err;
8932 }
8933 }
8934 } else {
8935 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008936 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008937 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8938 "to maketrans it must be a dict");
8939 goto err;
8940 }
8941 /* copy entries into the new dict, converting string keys to int keys */
8942 while (PyDict_Next(x, &i, &key, &value)) {
8943 if (PyUnicode_Check(key)) {
8944 /* convert string keys to integer keys */
8945 PyObject *newkey;
8946 if (PyUnicode_GET_SIZE(key) != 1) {
8947 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8948 "table must be of length 1");
8949 goto err;
8950 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008951 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008952 if (!newkey)
8953 goto err;
8954 res = PyDict_SetItem(new, newkey, value);
8955 Py_DECREF(newkey);
8956 if (res < 0)
8957 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008958 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008959 /* just keep integer keys */
8960 if (PyDict_SetItem(new, key, value) < 0)
8961 goto err;
8962 } else {
8963 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8964 "be strings or integers");
8965 goto err;
8966 }
8967 }
8968 }
8969 return new;
8970 err:
8971 Py_DECREF(new);
8972 return NULL;
8973}
8974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008975PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008976 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977\n\
8978Return a copy of the string S, where all characters have been mapped\n\
8979through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008980Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008981Unmapped characters are left untouched. Characters mapped to None\n\
8982are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
8984static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008985unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986{
Georg Brandlceee0772007-11-27 23:48:05 +00008987 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988}
8989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008990PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008993Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994
8995static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008996unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998 return fixup(self, fixupper);
8999}
9000
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009001PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009004Pad a numeric string S with zeros on the left, to fill a field\n\
9005of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006
9007static PyObject *
9008unicode_zfill(PyUnicodeObject *self, PyObject *args)
9009{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009010 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 PyUnicodeObject *u;
9012
Martin v. Löwis18e16552006-02-15 17:27:45 +00009013 Py_ssize_t width;
9014 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 return NULL;
9016
9017 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009018 if (PyUnicode_CheckExact(self)) {
9019 Py_INCREF(self);
9020 return (PyObject*) self;
9021 }
9022 else
9023 return PyUnicode_FromUnicode(
9024 PyUnicode_AS_UNICODE(self),
9025 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 }
9028
9029 fill = width - self->length;
9030
9031 u = pad(self, fill, 0, '0');
9032
Walter Dörwald068325e2002-04-15 13:36:47 +00009033 if (u == NULL)
9034 return NULL;
9035
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036 if (u->str[fill] == '+' || u->str[fill] == '-') {
9037 /* move sign to beginning of string */
9038 u->str[0] = u->str[fill];
9039 u->str[fill] = '0';
9040 }
9041
9042 return (PyObject*) u;
9043}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044
#if 0
/* Debugging helpers, compiled out. */

/* Return the current value of numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}

/* Return the result of PyUnicode_TransformDecimalToASCII() applied to
   the characters of self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
9059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009060PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009063Return True if S starts with the specified prefix, False otherwise.\n\
9064With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009065With optional end, stop comparing S at that position.\n\
9066prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067
9068static PyObject *
9069unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009070 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009072 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009074 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009075 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009076 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009078 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009079 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9080 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009081 if (PyTuple_Check(subobj)) {
9082 Py_ssize_t i;
9083 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9084 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009085 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009086 if (substring == NULL)
9087 return NULL;
9088 result = tailmatch(self, substring, start, end, -1);
9089 Py_DECREF(substring);
9090 if (result) {
9091 Py_RETURN_TRUE;
9092 }
9093 }
9094 /* nothing matched */
9095 Py_RETURN_FALSE;
9096 }
9097 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009099 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009100 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009102 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103}
9104
9105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009106PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009109Return True if S ends with the specified suffix, False otherwise.\n\
9110With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009111With optional end, stop comparing S at that position.\n\
9112suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009113
9114static PyObject *
9115unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009116 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009118 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009120 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009121 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009122 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009124 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009125 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9126 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009127 if (PyTuple_Check(subobj)) {
9128 Py_ssize_t i;
9129 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9130 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009132 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009133 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009134 result = tailmatch(self, substring, start, end, +1);
9135 Py_DECREF(substring);
9136 if (result) {
9137 Py_RETURN_TRUE;
9138 }
9139 }
9140 Py_RETURN_FALSE;
9141 }
9142 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009143 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009146 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009148 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149}
9150
Eric Smith8c663262007-08-25 02:26:07 +00009151#include "stringlib/string_format.h"
9152
9153PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009155\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009156Return a formatted version of S, using substitutions from args and kwargs.\n\
9157The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009158
Eric Smith27bbca62010-11-04 17:06:58 +00009159PyDoc_STRVAR(format_map__doc__,
9160 "S.format_map(mapping) -> str\n\
9161\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009162Return a formatted version of S, using substitutions from mapping.\n\
9163The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009164
Eric Smith4a7d76d2008-05-30 18:10:19 +00009165static PyObject *
9166unicode__format__(PyObject* self, PyObject* args)
9167{
9168 PyObject *format_spec;
9169
9170 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9171 return NULL;
9172
9173 return _PyUnicode_FormatAdvanced(self,
9174 PyUnicode_AS_UNICODE(format_spec),
9175 PyUnicode_GET_SIZE(format_spec));
9176}
9177
Eric Smith8c663262007-08-25 02:26:07 +00009178PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009179 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009180\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009181Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009182
9183static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009184unicode__sizeof__(PyUnicodeObject *v)
9185{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009186 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9187 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009188}
9189
9190PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009191 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009192
9193static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009194unicode_getnewargs(PyUnicodeObject *v)
9195{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009196 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009197}
9198
Guido van Rossumd57fd912000-03-10 22:53:23 +00009199static PyMethodDef unicode_methods[] = {
9200
9201 /* Order is according to common usage: often used methods should
9202 appear first, since lookup is done sequentially. */
9203
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009204 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009205 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9206 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009207 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009208 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9209 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9210 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9211 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9212 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9213 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9214 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009215 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009216 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9217 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9218 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009219 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009220 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9221 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9222 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009223 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009224 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009225 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009226 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009227 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9228 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9229 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9230 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9231 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9232 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9233 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9234 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9235 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9236 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9237 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9238 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9239 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9240 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009241 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009242 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009243 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009244 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009245 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009246 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009247 {"maketrans", (PyCFunction) unicode_maketrans,
9248 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009249 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009250#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009251 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252#endif
9253
9254#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009255 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009256 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009257 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258#endif
9259
Benjamin Peterson14339b62009-01-31 16:36:08 +00009260 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261 {NULL, NULL}
9262};
9263
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009264static PyObject *
9265unicode_mod(PyObject *v, PyObject *w)
9266{
Benjamin Peterson29060642009-01-31 22:14:21 +00009267 if (!PyUnicode_Check(v)) {
9268 Py_INCREF(Py_NotImplemented);
9269 return Py_NotImplemented;
9270 }
9271 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009272}
9273
9274static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009275 0, /*nb_add*/
9276 0, /*nb_subtract*/
9277 0, /*nb_multiply*/
9278 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009279};
9280
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009282 (lenfunc) unicode_length, /* sq_length */
9283 PyUnicode_Concat, /* sq_concat */
9284 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9285 (ssizeargfunc) unicode_getitem, /* sq_item */
9286 0, /* sq_slice */
9287 0, /* sq_ass_item */
9288 0, /* sq_ass_slice */
9289 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290};
9291
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009292static PyObject*
9293unicode_subscript(PyUnicodeObject* self, PyObject* item)
9294{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009295 if (PyIndex_Check(item)) {
9296 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009297 if (i == -1 && PyErr_Occurred())
9298 return NULL;
9299 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009300 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009301 return unicode_getitem(self, i);
9302 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009303 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009304 Py_UNICODE* source_buf;
9305 Py_UNICODE* result_buf;
9306 PyObject* result;
9307
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009308 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009309 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009310 return NULL;
9311 }
9312
9313 if (slicelength <= 0) {
9314 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009315 } else if (start == 0 && step == 1 && slicelength == self->length &&
9316 PyUnicode_CheckExact(self)) {
9317 Py_INCREF(self);
9318 return (PyObject *)self;
9319 } else if (step == 1) {
9320 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009321 } else {
9322 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009323 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9324 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009325
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 if (result_buf == NULL)
9327 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009328
9329 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9330 result_buf[i] = source_buf[cur];
9331 }
Tim Petersced69f82003-09-16 20:30:58 +00009332
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009333 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009334 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009335 return result;
9336 }
9337 } else {
9338 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9339 return NULL;
9340 }
9341}
9342
9343static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009344 (lenfunc)unicode_length, /* mp_length */
9345 (binaryfunc)unicode_subscript, /* mp_subscript */
9346 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009347};
9348
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350/* Helpers for PyUnicode_Format() */
9351
9352static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009353getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009355 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009357 (*p_argidx)++;
9358 if (arglen < 0)
9359 return args;
9360 else
9361 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362 }
9363 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009364 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 return NULL;
9366}
9367
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009368/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009370static PyObject *
9371formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009373 char *p;
9374 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009376
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377 x = PyFloat_AsDouble(v);
9378 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009379 return NULL;
9380
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009382 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009383
Eric Smith0923d1d2009-04-16 20:16:10 +00009384 p = PyOS_double_to_string(x, type, prec,
9385 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009386 if (p == NULL)
9387 return NULL;
9388 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009389 PyMem_Free(p);
9390 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009391}
9392
Tim Peters38fd5b62000-09-21 05:43:11 +00009393static PyObject*
9394formatlong(PyObject *val, int flags, int prec, int type)
9395{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009396 char *buf;
9397 int len;
9398 PyObject *str; /* temporary string object. */
9399 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009400
Benjamin Peterson14339b62009-01-31 16:36:08 +00009401 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9402 if (!str)
9403 return NULL;
9404 result = PyUnicode_FromStringAndSize(buf, len);
9405 Py_DECREF(str);
9406 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009407}
9408
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409static int
9410formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009411 size_t buflen,
9412 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009414 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009415 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 if (PyUnicode_GET_SIZE(v) == 1) {
9417 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9418 buf[1] = '\0';
9419 return 1;
9420 }
9421#ifndef Py_UNICODE_WIDE
9422 if (PyUnicode_GET_SIZE(v) == 2) {
9423 /* Decode a valid surrogate pair */
9424 int c0 = PyUnicode_AS_UNICODE(v)[0];
9425 int c1 = PyUnicode_AS_UNICODE(v)[1];
9426 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9427 0xDC00 <= c1 && c1 <= 0xDFFF) {
9428 buf[0] = c0;
9429 buf[1] = c1;
9430 buf[2] = '\0';
9431 return 2;
9432 }
9433 }
9434#endif
9435 goto onError;
9436 }
9437 else {
9438 /* Integer input truncated to a character */
9439 long x;
9440 x = PyLong_AsLong(v);
9441 if (x == -1 && PyErr_Occurred())
9442 goto onError;
9443
9444 if (x < 0 || x > 0x10ffff) {
9445 PyErr_SetString(PyExc_OverflowError,
9446 "%c arg not in range(0x110000)");
9447 return -1;
9448 }
9449
9450#ifndef Py_UNICODE_WIDE
9451 if (x > 0xffff) {
9452 x -= 0x10000;
9453 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9454 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9455 return 2;
9456 }
9457#endif
9458 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009459 buf[1] = '\0';
9460 return 1;
9461 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009462
Benjamin Peterson29060642009-01-31 22:14:21 +00009463 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009464 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009465 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009466 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467}
9468
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009469/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009470 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009471*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009472#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009473
Alexander Belopolsky40018472011-02-26 01:02:56 +00009474PyObject *
9475PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476{
9477 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009478 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479 int args_owned = 0;
9480 PyUnicodeObject *result = NULL;
9481 PyObject *dict = NULL;
9482 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009483
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009485 PyErr_BadInternalCall();
9486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487 }
9488 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009489 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 fmt = PyUnicode_AS_UNICODE(uformat);
9492 fmtcnt = PyUnicode_GET_SIZE(uformat);
9493
9494 reslen = rescnt = fmtcnt + 100;
9495 result = _PyUnicode_New(reslen);
9496 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009497 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498 res = PyUnicode_AS_UNICODE(result);
9499
9500 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 arglen = PyTuple_Size(args);
9502 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 }
9504 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 arglen = -1;
9506 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009508 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009509 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511
9512 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 if (*fmt != '%') {
9514 if (--rescnt < 0) {
9515 rescnt = fmtcnt + 100;
9516 reslen += rescnt;
9517 if (_PyUnicode_Resize(&result, reslen) < 0)
9518 goto onError;
9519 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9520 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009521 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009523 }
9524 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009525 /* Got a format specifier */
9526 int flags = 0;
9527 Py_ssize_t width = -1;
9528 int prec = -1;
9529 Py_UNICODE c = '\0';
9530 Py_UNICODE fill;
9531 int isnumok;
9532 PyObject *v = NULL;
9533 PyObject *temp = NULL;
9534 Py_UNICODE *pbuf;
9535 Py_UNICODE sign;
9536 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009537 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 fmt++;
9540 if (*fmt == '(') {
9541 Py_UNICODE *keystart;
9542 Py_ssize_t keylen;
9543 PyObject *key;
9544 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009545
Benjamin Peterson29060642009-01-31 22:14:21 +00009546 if (dict == NULL) {
9547 PyErr_SetString(PyExc_TypeError,
9548 "format requires a mapping");
9549 goto onError;
9550 }
9551 ++fmt;
9552 --fmtcnt;
9553 keystart = fmt;
9554 /* Skip over balanced parentheses */
9555 while (pcount > 0 && --fmtcnt >= 0) {
9556 if (*fmt == ')')
9557 --pcount;
9558 else if (*fmt == '(')
9559 ++pcount;
9560 fmt++;
9561 }
9562 keylen = fmt - keystart - 1;
9563 if (fmtcnt < 0 || pcount > 0) {
9564 PyErr_SetString(PyExc_ValueError,
9565 "incomplete format key");
9566 goto onError;
9567 }
9568#if 0
9569 /* keys are converted to strings using UTF-8 and
9570 then looked up since Python uses strings to hold
9571 variables names etc. in its namespaces and we
9572 wouldn't want to break common idioms. */
9573 key = PyUnicode_EncodeUTF8(keystart,
9574 keylen,
9575 NULL);
9576#else
9577 key = PyUnicode_FromUnicode(keystart, keylen);
9578#endif
9579 if (key == NULL)
9580 goto onError;
9581 if (args_owned) {
9582 Py_DECREF(args);
9583 args_owned = 0;
9584 }
9585 args = PyObject_GetItem(dict, key);
9586 Py_DECREF(key);
9587 if (args == NULL) {
9588 goto onError;
9589 }
9590 args_owned = 1;
9591 arglen = -1;
9592 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 while (--fmtcnt >= 0) {
9595 switch (c = *fmt++) {
9596 case '-': flags |= F_LJUST; continue;
9597 case '+': flags |= F_SIGN; continue;
9598 case ' ': flags |= F_BLANK; continue;
9599 case '#': flags |= F_ALT; continue;
9600 case '0': flags |= F_ZERO; continue;
9601 }
9602 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009603 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 if (c == '*') {
9605 v = getnextarg(args, arglen, &argidx);
9606 if (v == NULL)
9607 goto onError;
9608 if (!PyLong_Check(v)) {
9609 PyErr_SetString(PyExc_TypeError,
9610 "* wants int");
9611 goto onError;
9612 }
9613 width = PyLong_AsLong(v);
9614 if (width == -1 && PyErr_Occurred())
9615 goto onError;
9616 if (width < 0) {
9617 flags |= F_LJUST;
9618 width = -width;
9619 }
9620 if (--fmtcnt >= 0)
9621 c = *fmt++;
9622 }
9623 else if (c >= '0' && c <= '9') {
9624 width = c - '0';
9625 while (--fmtcnt >= 0) {
9626 c = *fmt++;
9627 if (c < '0' || c > '9')
9628 break;
9629 if ((width*10) / 10 != width) {
9630 PyErr_SetString(PyExc_ValueError,
9631 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009632 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009633 }
9634 width = width*10 + (c - '0');
9635 }
9636 }
9637 if (c == '.') {
9638 prec = 0;
9639 if (--fmtcnt >= 0)
9640 c = *fmt++;
9641 if (c == '*') {
9642 v = getnextarg(args, arglen, &argidx);
9643 if (v == NULL)
9644 goto onError;
9645 if (!PyLong_Check(v)) {
9646 PyErr_SetString(PyExc_TypeError,
9647 "* wants int");
9648 goto onError;
9649 }
9650 prec = PyLong_AsLong(v);
9651 if (prec == -1 && PyErr_Occurred())
9652 goto onError;
9653 if (prec < 0)
9654 prec = 0;
9655 if (--fmtcnt >= 0)
9656 c = *fmt++;
9657 }
9658 else if (c >= '0' && c <= '9') {
9659 prec = c - '0';
9660 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009661 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009662 if (c < '0' || c > '9')
9663 break;
9664 if ((prec*10) / 10 != prec) {
9665 PyErr_SetString(PyExc_ValueError,
9666 "prec too big");
9667 goto onError;
9668 }
9669 prec = prec*10 + (c - '0');
9670 }
9671 }
9672 } /* prec */
9673 if (fmtcnt >= 0) {
9674 if (c == 'h' || c == 'l' || c == 'L') {
9675 if (--fmtcnt >= 0)
9676 c = *fmt++;
9677 }
9678 }
9679 if (fmtcnt < 0) {
9680 PyErr_SetString(PyExc_ValueError,
9681 "incomplete format");
9682 goto onError;
9683 }
9684 if (c != '%') {
9685 v = getnextarg(args, arglen, &argidx);
9686 if (v == NULL)
9687 goto onError;
9688 }
9689 sign = 0;
9690 fill = ' ';
9691 switch (c) {
9692
9693 case '%':
9694 pbuf = formatbuf;
9695 /* presume that buffer length is at least 1 */
9696 pbuf[0] = '%';
9697 len = 1;
9698 break;
9699
9700 case 's':
9701 case 'r':
9702 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009703 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009704 temp = v;
9705 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009706 }
9707 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009708 if (c == 's')
9709 temp = PyObject_Str(v);
9710 else if (c == 'r')
9711 temp = PyObject_Repr(v);
9712 else
9713 temp = PyObject_ASCII(v);
9714 if (temp == NULL)
9715 goto onError;
9716 if (PyUnicode_Check(temp))
9717 /* nothing to do */;
9718 else {
9719 Py_DECREF(temp);
9720 PyErr_SetString(PyExc_TypeError,
9721 "%s argument has non-string str()");
9722 goto onError;
9723 }
9724 }
9725 pbuf = PyUnicode_AS_UNICODE(temp);
9726 len = PyUnicode_GET_SIZE(temp);
9727 if (prec >= 0 && len > prec)
9728 len = prec;
9729 break;
9730
9731 case 'i':
9732 case 'd':
9733 case 'u':
9734 case 'o':
9735 case 'x':
9736 case 'X':
9737 if (c == 'i')
9738 c = 'd';
9739 isnumok = 0;
9740 if (PyNumber_Check(v)) {
9741 PyObject *iobj=NULL;
9742
9743 if (PyLong_Check(v)) {
9744 iobj = v;
9745 Py_INCREF(iobj);
9746 }
9747 else {
9748 iobj = PyNumber_Long(v);
9749 }
9750 if (iobj!=NULL) {
9751 if (PyLong_Check(iobj)) {
9752 isnumok = 1;
9753 temp = formatlong(iobj, flags, prec, c);
9754 Py_DECREF(iobj);
9755 if (!temp)
9756 goto onError;
9757 pbuf = PyUnicode_AS_UNICODE(temp);
9758 len = PyUnicode_GET_SIZE(temp);
9759 sign = 1;
9760 }
9761 else {
9762 Py_DECREF(iobj);
9763 }
9764 }
9765 }
9766 if (!isnumok) {
9767 PyErr_Format(PyExc_TypeError,
9768 "%%%c format: a number is required, "
9769 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9770 goto onError;
9771 }
9772 if (flags & F_ZERO)
9773 fill = '0';
9774 break;
9775
9776 case 'e':
9777 case 'E':
9778 case 'f':
9779 case 'F':
9780 case 'g':
9781 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009782 temp = formatfloat(v, flags, prec, c);
9783 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009784 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009785 pbuf = PyUnicode_AS_UNICODE(temp);
9786 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009787 sign = 1;
9788 if (flags & F_ZERO)
9789 fill = '0';
9790 break;
9791
9792 case 'c':
9793 pbuf = formatbuf;
9794 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9795 if (len < 0)
9796 goto onError;
9797 break;
9798
9799 default:
9800 PyErr_Format(PyExc_ValueError,
9801 "unsupported format character '%c' (0x%x) "
9802 "at index %zd",
9803 (31<=c && c<=126) ? (char)c : '?',
9804 (int)c,
9805 (Py_ssize_t)(fmt - 1 -
9806 PyUnicode_AS_UNICODE(uformat)));
9807 goto onError;
9808 }
9809 if (sign) {
9810 if (*pbuf == '-' || *pbuf == '+') {
9811 sign = *pbuf++;
9812 len--;
9813 }
9814 else if (flags & F_SIGN)
9815 sign = '+';
9816 else if (flags & F_BLANK)
9817 sign = ' ';
9818 else
9819 sign = 0;
9820 }
9821 if (width < len)
9822 width = len;
9823 if (rescnt - (sign != 0) < width) {
9824 reslen -= rescnt;
9825 rescnt = width + fmtcnt + 100;
9826 reslen += rescnt;
9827 if (reslen < 0) {
9828 Py_XDECREF(temp);
9829 PyErr_NoMemory();
9830 goto onError;
9831 }
9832 if (_PyUnicode_Resize(&result, reslen) < 0) {
9833 Py_XDECREF(temp);
9834 goto onError;
9835 }
9836 res = PyUnicode_AS_UNICODE(result)
9837 + reslen - rescnt;
9838 }
9839 if (sign) {
9840 if (fill != ' ')
9841 *res++ = sign;
9842 rescnt--;
9843 if (width > len)
9844 width--;
9845 }
9846 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9847 assert(pbuf[0] == '0');
9848 assert(pbuf[1] == c);
9849 if (fill != ' ') {
9850 *res++ = *pbuf++;
9851 *res++ = *pbuf++;
9852 }
9853 rescnt -= 2;
9854 width -= 2;
9855 if (width < 0)
9856 width = 0;
9857 len -= 2;
9858 }
9859 if (width > len && !(flags & F_LJUST)) {
9860 do {
9861 --rescnt;
9862 *res++ = fill;
9863 } while (--width > len);
9864 }
9865 if (fill == ' ') {
9866 if (sign)
9867 *res++ = sign;
9868 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9869 assert(pbuf[0] == '0');
9870 assert(pbuf[1] == c);
9871 *res++ = *pbuf++;
9872 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009873 }
9874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009875 Py_UNICODE_COPY(res, pbuf, len);
9876 res += len;
9877 rescnt -= len;
9878 while (--width >= len) {
9879 --rescnt;
9880 *res++ = ' ';
9881 }
9882 if (dict && (argidx < arglen) && c != '%') {
9883 PyErr_SetString(PyExc_TypeError,
9884 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009885 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009886 goto onError;
9887 }
9888 Py_XDECREF(temp);
9889 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890 } /* until end */
9891 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009892 PyErr_SetString(PyExc_TypeError,
9893 "not all arguments converted during string formatting");
9894 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895 }
9896
Thomas Woutersa96affe2006-03-12 00:29:36 +00009897 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009898 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009900 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901 }
9902 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903 return (PyObject *)result;
9904
Benjamin Peterson29060642009-01-31 22:14:21 +00009905 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906 Py_XDECREF(result);
9907 Py_DECREF(uformat);
9908 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009909 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910 }
9911 return NULL;
9912}
9913
Jeremy Hylton938ace62002-07-17 16:30:39 +00009914static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009915unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9916
Tim Peters6d6c1a32001-08-02 04:15:00 +00009917static PyObject *
9918unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9919{
Benjamin Peterson29060642009-01-31 22:14:21 +00009920 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009921 static char *kwlist[] = {"object", "encoding", "errors", 0};
9922 char *encoding = NULL;
9923 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009924
Benjamin Peterson14339b62009-01-31 16:36:08 +00009925 if (type != &PyUnicode_Type)
9926 return unicode_subtype_new(type, args, kwds);
9927 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009929 return NULL;
9930 if (x == NULL)
9931 return (PyObject *)_PyUnicode_New(0);
9932 if (encoding == NULL && errors == NULL)
9933 return PyObject_Str(x);
9934 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009935 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009936}
9937
Guido van Rossume023fe02001-08-30 03:12:59 +00009938static PyObject *
9939unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9940{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009941 PyUnicodeObject *tmp, *pnew;
9942 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009943
Benjamin Peterson14339b62009-01-31 16:36:08 +00009944 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9945 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9946 if (tmp == NULL)
9947 return NULL;
9948 assert(PyUnicode_Check(tmp));
9949 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9950 if (pnew == NULL) {
9951 Py_DECREF(tmp);
9952 return NULL;
9953 }
9954 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9955 if (pnew->str == NULL) {
9956 _Py_ForgetReference((PyObject *)pnew);
9957 PyObject_Del(pnew);
9958 Py_DECREF(tmp);
9959 return PyErr_NoMemory();
9960 }
9961 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9962 pnew->length = n;
9963 pnew->hash = tmp->hash;
9964 Py_DECREF(tmp);
9965 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009966}
9967
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009968PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009969 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009970\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009971Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009972encoding defaults to the current default string encoding.\n\
9973errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009974
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009975static PyObject *unicode_iter(PyObject *seq);
9976
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009978 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009979 "str", /* tp_name */
9980 sizeof(PyUnicodeObject), /* tp_size */
9981 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009982 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009983 (destructor)unicode_dealloc, /* tp_dealloc */
9984 0, /* tp_print */
9985 0, /* tp_getattr */
9986 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009987 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009988 unicode_repr, /* tp_repr */
9989 &unicode_as_number, /* tp_as_number */
9990 &unicode_as_sequence, /* tp_as_sequence */
9991 &unicode_as_mapping, /* tp_as_mapping */
9992 (hashfunc) unicode_hash, /* tp_hash*/
9993 0, /* tp_call*/
9994 (reprfunc) unicode_str, /* tp_str */
9995 PyObject_GenericGetAttr, /* tp_getattro */
9996 0, /* tp_setattro */
9997 0, /* tp_as_buffer */
9998 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +00009999 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010000 unicode_doc, /* tp_doc */
10001 0, /* tp_traverse */
10002 0, /* tp_clear */
10003 PyUnicode_RichCompare, /* tp_richcompare */
10004 0, /* tp_weaklistoffset */
10005 unicode_iter, /* tp_iter */
10006 0, /* tp_iternext */
10007 unicode_methods, /* tp_methods */
10008 0, /* tp_members */
10009 0, /* tp_getset */
10010 &PyBaseObject_Type, /* tp_base */
10011 0, /* tp_dict */
10012 0, /* tp_descr_get */
10013 0, /* tp_descr_set */
10014 0, /* tp_dictoffset */
10015 0, /* tp_init */
10016 0, /* tp_alloc */
10017 unicode_new, /* tp_new */
10018 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019};
10020
10021/* Initialize the Unicode implementation */
10022
Thomas Wouters78890102000-07-22 19:25:51 +000010023void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010025 int i;
10026
Thomas Wouters477c8d52006-05-27 19:21:47 +000010027 /* XXX - move this array to unicodectype.c ? */
10028 Py_UNICODE linebreak[] = {
10029 0x000A, /* LINE FEED */
10030 0x000D, /* CARRIAGE RETURN */
10031 0x001C, /* FILE SEPARATOR */
10032 0x001D, /* GROUP SEPARATOR */
10033 0x001E, /* RECORD SEPARATOR */
10034 0x0085, /* NEXT LINE */
10035 0x2028, /* LINE SEPARATOR */
10036 0x2029, /* PARAGRAPH SEPARATOR */
10037 };
10038
Fred Drakee4315f52000-05-09 19:53:39 +000010039 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010040 free_list = NULL;
10041 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010043 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010044 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010045
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010046 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010047 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010048 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010049 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010050
10051 /* initialize the linebreak bloom filter */
10052 bloom_linebreak = make_bloom_mask(
10053 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10054 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010055
10056 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010057}
10058
10059/* Finalize the Unicode implementation */
10060
Christian Heimesa156e092008-02-16 07:38:31 +000010061int
10062PyUnicode_ClearFreeList(void)
10063{
10064 int freelist_size = numfree;
10065 PyUnicodeObject *u;
10066
10067 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010068 PyUnicodeObject *v = u;
10069 u = *(PyUnicodeObject **)u;
10070 if (v->str)
10071 PyObject_DEL(v->str);
10072 Py_XDECREF(v->defenc);
10073 PyObject_Del(v);
10074 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010075 }
10076 free_list = NULL;
10077 assert(numfree == 0);
10078 return freelist_size;
10079}
10080
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081void
Thomas Wouters78890102000-07-22 19:25:51 +000010082_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010084 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010086 Py_XDECREF(unicode_empty);
10087 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010088
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010089 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010090 if (unicode_latin1[i]) {
10091 Py_DECREF(unicode_latin1[i]);
10092 unicode_latin1[i] = NULL;
10093 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010094 }
Christian Heimesa156e092008-02-16 07:38:31 +000010095 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010097
Walter Dörwald16807132007-05-25 13:52:07 +000010098void
10099PyUnicode_InternInPlace(PyObject **p)
10100{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010101 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10102 PyObject *t;
10103 if (s == NULL || !PyUnicode_Check(s))
10104 Py_FatalError(
10105 "PyUnicode_InternInPlace: unicode strings only please!");
10106 /* If it's a subclass, we don't really know what putting
10107 it in the interned dict might do. */
10108 if (!PyUnicode_CheckExact(s))
10109 return;
10110 if (PyUnicode_CHECK_INTERNED(s))
10111 return;
10112 if (interned == NULL) {
10113 interned = PyDict_New();
10114 if (interned == NULL) {
10115 PyErr_Clear(); /* Don't leave an exception */
10116 return;
10117 }
10118 }
10119 /* It might be that the GetItem call fails even
10120 though the key is present in the dictionary,
10121 namely when this happens during a stack overflow. */
10122 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010123 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010124 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010125
Benjamin Peterson29060642009-01-31 22:14:21 +000010126 if (t) {
10127 Py_INCREF(t);
10128 Py_DECREF(*p);
10129 *p = t;
10130 return;
10131 }
Walter Dörwald16807132007-05-25 13:52:07 +000010132
Benjamin Peterson14339b62009-01-31 16:36:08 +000010133 PyThreadState_GET()->recursion_critical = 1;
10134 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10135 PyErr_Clear();
10136 PyThreadState_GET()->recursion_critical = 0;
10137 return;
10138 }
10139 PyThreadState_GET()->recursion_critical = 0;
10140 /* The two references in interned are not counted by refcnt.
10141 The deallocator will take care of this */
10142 Py_REFCNT(s) -= 2;
10143 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010144}
10145
10146void
10147PyUnicode_InternImmortal(PyObject **p)
10148{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010149 PyUnicode_InternInPlace(p);
10150 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10151 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10152 Py_INCREF(*p);
10153 }
Walter Dörwald16807132007-05-25 13:52:07 +000010154}
10155
10156PyObject *
10157PyUnicode_InternFromString(const char *cp)
10158{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010159 PyObject *s = PyUnicode_FromString(cp);
10160 if (s == NULL)
10161 return NULL;
10162 PyUnicode_InternInPlace(&s);
10163 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010164}
10165
Alexander Belopolsky40018472011-02-26 01:02:56 +000010166void
10167_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010168{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010169 PyObject *keys;
10170 PyUnicodeObject *s;
10171 Py_ssize_t i, n;
10172 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010173
Benjamin Peterson14339b62009-01-31 16:36:08 +000010174 if (interned == NULL || !PyDict_Check(interned))
10175 return;
10176 keys = PyDict_Keys(interned);
10177 if (keys == NULL || !PyList_Check(keys)) {
10178 PyErr_Clear();
10179 return;
10180 }
Walter Dörwald16807132007-05-25 13:52:07 +000010181
Benjamin Peterson14339b62009-01-31 16:36:08 +000010182 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10183 detector, interned unicode strings are not forcibly deallocated;
10184 rather, we give them their stolen references back, and then clear
10185 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010186
Benjamin Peterson14339b62009-01-31 16:36:08 +000010187 n = PyList_GET_SIZE(keys);
10188 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010189 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010190 for (i = 0; i < n; i++) {
10191 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10192 switch (s->state) {
10193 case SSTATE_NOT_INTERNED:
10194 /* XXX Shouldn't happen */
10195 break;
10196 case SSTATE_INTERNED_IMMORTAL:
10197 Py_REFCNT(s) += 1;
10198 immortal_size += s->length;
10199 break;
10200 case SSTATE_INTERNED_MORTAL:
10201 Py_REFCNT(s) += 2;
10202 mortal_size += s->length;
10203 break;
10204 default:
10205 Py_FatalError("Inconsistent interned string state.");
10206 }
10207 s->state = SSTATE_NOT_INTERNED;
10208 }
10209 fprintf(stderr, "total size of all interned strings: "
10210 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10211 "mortal/immortal\n", mortal_size, immortal_size);
10212 Py_DECREF(keys);
10213 PyDict_Clear(interned);
10214 Py_DECREF(interned);
10215 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010216}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010217
10218
10219/********************* Unicode Iterator **************************/
10220
10221typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010222 PyObject_HEAD
10223 Py_ssize_t it_index;
10224 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010225} unicodeiterobject;
10226
10227static void
10228unicodeiter_dealloc(unicodeiterobject *it)
10229{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010230 _PyObject_GC_UNTRACK(it);
10231 Py_XDECREF(it->it_seq);
10232 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010233}
10234
10235static int
10236unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10237{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010238 Py_VISIT(it->it_seq);
10239 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010240}
10241
10242static PyObject *
10243unicodeiter_next(unicodeiterobject *it)
10244{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010245 PyUnicodeObject *seq;
10246 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010247
Benjamin Peterson14339b62009-01-31 16:36:08 +000010248 assert(it != NULL);
10249 seq = it->it_seq;
10250 if (seq == NULL)
10251 return NULL;
10252 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010253
Benjamin Peterson14339b62009-01-31 16:36:08 +000010254 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10255 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010256 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 if (item != NULL)
10258 ++it->it_index;
10259 return item;
10260 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010261
Benjamin Peterson14339b62009-01-31 16:36:08 +000010262 Py_DECREF(seq);
10263 it->it_seq = NULL;
10264 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010265}
10266
10267static PyObject *
10268unicodeiter_len(unicodeiterobject *it)
10269{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010270 Py_ssize_t len = 0;
10271 if (it->it_seq)
10272 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10273 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010274}
10275
10276PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10277
10278static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010279 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010281 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010282};
10283
10284PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010285 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10286 "str_iterator", /* tp_name */
10287 sizeof(unicodeiterobject), /* tp_basicsize */
10288 0, /* tp_itemsize */
10289 /* methods */
10290 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10291 0, /* tp_print */
10292 0, /* tp_getattr */
10293 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010294 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010295 0, /* tp_repr */
10296 0, /* tp_as_number */
10297 0, /* tp_as_sequence */
10298 0, /* tp_as_mapping */
10299 0, /* tp_hash */
10300 0, /* tp_call */
10301 0, /* tp_str */
10302 PyObject_GenericGetAttr, /* tp_getattro */
10303 0, /* tp_setattro */
10304 0, /* tp_as_buffer */
10305 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10306 0, /* tp_doc */
10307 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10308 0, /* tp_clear */
10309 0, /* tp_richcompare */
10310 0, /* tp_weaklistoffset */
10311 PyObject_SelfIter, /* tp_iter */
10312 (iternextfunc)unicodeiter_next, /* tp_iternext */
10313 unicodeiter_methods, /* tp_methods */
10314 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010315};
10316
10317static PyObject *
10318unicode_iter(PyObject *seq)
10319{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010320 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010321
Benjamin Peterson14339b62009-01-31 16:36:08 +000010322 if (!PyUnicode_Check(seq)) {
10323 PyErr_BadInternalCall();
10324 return NULL;
10325 }
10326 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10327 if (it == NULL)
10328 return NULL;
10329 it->it_index = 0;
10330 Py_INCREF(seq);
10331 it->it_seq = (PyUnicodeObject *)seq;
10332 _PyObject_GC_TRACK(it);
10333 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010334}
10335
Martin v. Löwis5b222132007-06-10 09:51:05 +000010336size_t
10337Py_UNICODE_strlen(const Py_UNICODE *u)
10338{
10339 int res = 0;
10340 while(*u++)
10341 res++;
10342 return res;
10343}
10344
10345Py_UNICODE*
10346Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10347{
10348 Py_UNICODE *u = s1;
10349 while ((*u++ = *s2++));
10350 return s1;
10351}
10352
10353Py_UNICODE*
10354Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10355{
10356 Py_UNICODE *u = s1;
10357 while ((*u++ = *s2++))
10358 if (n-- == 0)
10359 break;
10360 return s1;
10361}
10362
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010363Py_UNICODE*
10364Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10365{
10366 Py_UNICODE *u1 = s1;
10367 u1 += Py_UNICODE_strlen(u1);
10368 Py_UNICODE_strcpy(u1, s2);
10369 return s1;
10370}
10371
Martin v. Löwis5b222132007-06-10 09:51:05 +000010372int
10373Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10374{
10375 while (*s1 && *s2 && *s1 == *s2)
10376 s1++, s2++;
10377 if (*s1 && *s2)
10378 return (*s1 < *s2) ? -1 : +1;
10379 if (*s1)
10380 return 1;
10381 if (*s2)
10382 return -1;
10383 return 0;
10384}
10385
Victor Stinneref8d95c2010-08-16 22:03:11 +000010386int
10387Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10388{
10389 register Py_UNICODE u1, u2;
10390 for (; n != 0; n--) {
10391 u1 = *s1;
10392 u2 = *s2;
10393 if (u1 != u2)
10394 return (u1 < u2) ? -1 : +1;
10395 if (u1 == '\0')
10396 return 0;
10397 s1++;
10398 s2++;
10399 }
10400 return 0;
10401}
10402
Martin v. Löwis5b222132007-06-10 09:51:05 +000010403Py_UNICODE*
10404Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10405{
10406 const Py_UNICODE *p;
10407 for (p = s; *p; p++)
10408 if (*p == c)
10409 return (Py_UNICODE*)p;
10410 return NULL;
10411}
10412
Victor Stinner331ea922010-08-10 16:37:20 +000010413Py_UNICODE*
10414Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10415{
10416 const Py_UNICODE *p;
10417 p = s + Py_UNICODE_strlen(s);
10418 while (p != s) {
10419 p--;
10420 if (*p == c)
10421 return (Py_UNICODE*)p;
10422 }
10423 return NULL;
10424}
10425
Victor Stinner71133ff2010-09-01 23:43:53 +000010426Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010427PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010428{
10429 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10430 Py_UNICODE *copy;
10431 Py_ssize_t size;
10432
10433 /* Ensure we won't overflow the size. */
10434 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10435 PyErr_NoMemory();
10436 return NULL;
10437 }
10438 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10439 size *= sizeof(Py_UNICODE);
10440 copy = PyMem_Malloc(size);
10441 if (copy == NULL) {
10442 PyErr_NoMemory();
10443 return NULL;
10444 }
10445 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10446 return copy;
10447}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010448
Georg Brandl66c221e2010-10-14 07:04:07 +000010449/* A _string module, to export formatter_parser and formatter_field_name_split
10450 to the string.Formatter class implemented in Python. */
10451
10452static PyMethodDef _string_methods[] = {
10453 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10454 METH_O, PyDoc_STR("split the argument as a field name")},
10455 {"formatter_parser", (PyCFunction) formatter_parser,
10456 METH_O, PyDoc_STR("parse the argument as a format string")},
10457 {NULL, NULL}
10458};
10459
10460static struct PyModuleDef _string_module = {
10461 PyModuleDef_HEAD_INIT,
10462 "_string",
10463 PyDoc_STR("string helper module"),
10464 0,
10465 _string_methods,
10466 NULL,
10467 NULL,
10468 NULL,
10469 NULL
10470};
10471
10472PyMODINIT_FUNC
10473PyInit__string(void)
10474{
10475 return PyModule_Create(&_string_module);
10476}
10477
10478
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010479#ifdef __cplusplus
10480}
10481#endif