/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
98 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +000099 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1 when
   that code point is one of the whitespace characters listed below and 0
   otherwise.  Each row of the initializer covers eight code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
/* Same for linebreaks.

   Lookup table indexed by an ASCII code point (0..127); an entry is 1 when
   that code point is one of the line break characters listed below and 0
   otherwise.  Each row of the initializer covers eight code points.  The
   table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};


Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each unicode character as the bit index. */

/* the linebreak mask is not initialised here; see the "Globals" note on
   _PyUnicode_Init() */

/* BLOOM_WIDTH is the number of mask bits that are used: the largest of
   128, 64 and 32 that fits in an unsigned long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for character `ch` in the lvalue `mask`;
   BLOOM tests it (non-zero when set).  A set bit only means "possibly
   present": distinct characters can share a bit.  `mask` is parenthesised
   so that compound expressions can be passed safely. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask is
   consulted first and Py_UNICODE_ISLINEBREAK() only when the mask hits.
   Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))

Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Membership test: the cheap bloom check first, then the exact linear scan.
   The whole expansion is parenthesised so the macro can be negated or
   combined with other operators safely.  `chr` is evaluated more than
   once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))

Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
309/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000310 Ux0000 terminated; some code (e.g. new_identifier)
311 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312
313 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316*/
317
Alexander Belopolsky40018472011-02-26 01:02:56 +0000318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320{
321 register PyUnicodeObject *unicode;
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty);
326 return unicode_empty;
327 }
328
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000329 /* Ensure we won't overflow the size. */
330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331 return (PyUnicodeObject *)PyErr_NoMemory();
332 }
333
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000335 if (free_list) {
336 unicode = free_list;
337 free_list = *(PyUnicodeObject **)unicode;
338 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 if (unicode->str) {
340 /* Keep-Alive optimization: we only upsize the buffer,
341 never downsize it. */
342 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000343 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 PyObject_DEL(unicode->str);
345 unicode->str = NULL;
346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000347 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 }
352 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 }
354 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 if (unicode == NULL)
358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000363 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 PyErr_NoMemory();
365 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000367 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000368 * the caller fails before initializing str -- unicode_resize()
369 * reads str[0], and the Keep-Alive optimization can keep memory
370 * allocated for str alive across a call to unicode_dealloc(unicode).
371 * We don't want unicode_resize to read uninitialized memory in
372 * that case.
373 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000374 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000383 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000386 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388}
389
Alexander Belopolsky40018472011-02-26 01:02:56 +0000390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392{
Walter Dörwald16807132007-05-25 13:52:07 +0000393 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000394 case SSTATE_NOT_INTERNED:
395 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000396
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError(
402 "deletion of interned string failed");
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_IMMORTAL:
406 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 default:
409 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410 }
411
Guido van Rossum604ddf82001-12-06 20:03:56 +0000412 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000414 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416 PyObject_DEL(unicode->str);
417 unicode->str = NULL;
418 unicode->length = 0;
419 }
420 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000421 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 }
423 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000424 *(PyUnicodeObject **)unicode = free_list;
425 free_list = unicode;
426 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000429 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 }
433}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
Mark Dickinson081dfee2009-03-18 14:47:41 +0000581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
/* Build a printf-style conversion specification in `fmt`:

       '%' ['0'] [width] ['.' precision] [length modifier] c

   width and precision are only emitted when non-zero, and the '0' flag only
   together with a width.  At most one length modifier is emitted, chosen in
   this order: longflag ("l"), longlongflag (PY_FORMAT_LONG_LONG),
   size_tflag (PY_FORMAT_SIZE_T).  `fmt` must be large enough for the result
   including the terminating NUL; no bounds checking is done here. */
static void
makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
        int zeropad, int width, int precision, char c)
{
    *fmt++ = '%';
    if (width) {
        if (zeropad)
            *fmt++ = '0';
        fmt += sprintf(fmt, "%d", width);
    }
    if (precision)
        fmt += sprintf(fmt, ".%d", precision);
    if (longflag)
        *fmt++ = 'l';
    else if (longlongflag) {
        /* longlongflag should only ever be nonzero on machines with
           HAVE_LONG_LONG defined */
#ifdef HAVE_LONG_LONG
        const char *f = PY_FORMAT_LONG_LONG;
        while (*f)
            *fmt++ = *f++;
#else
        /* we shouldn't ever get here */
        assert(0);
        *fmt++ = 'l';
#endif
    }
    else if (size_tflag) {
        const char *f = PY_FORMAT_SIZE_T;
        while (*f)
            *fmt++ = *f++;
    }
    *fmt++ = c;
    *fmt = '\0';
}

Victor Stinner96865452011-03-01 23:44:09 +0000717/* helper for PyUnicode_FromFormatV() */
718
719static const char*
720parse_format_flags(const char *f,
721 int *p_width, int *p_precision,
722 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
723{
724 int width, precision, longflag, longlongflag, size_tflag;
725
726 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
727 f++;
728 width = 0;
729 while (Py_ISDIGIT((unsigned)*f))
730 width = (width*10) + *f++ - '0';
731 precision = 0;
732 if (*f == '.') {
733 f++;
734 while (Py_ISDIGIT((unsigned)*f))
735 precision = (precision*10) + *f++ - '0';
736 if (*f == '%') {
737 /* "%.3%s" => f points to "3" */
738 f--;
739 }
740 }
741 if (*f == '\0') {
742 /* bogus format "%.1" => go backward, f points to "1" */
743 f--;
744 }
745 if (p_width != NULL)
746 *p_width = width;
747 if (p_precision != NULL)
748 *p_precision = precision;
749
750 /* Handle %ld, %lu, %lld and %llu. */
751 longflag = 0;
752 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +0000753 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +0000754
755 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +0000756 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +0000757 longflag = 1;
758 ++f;
759 }
760#ifdef HAVE_LONG_LONG
761 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +0000762 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +0000763 longlongflag = 1;
764 f += 2;
765 }
766#endif
767 }
768 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +0000769 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +0000770 size_tflag = 1;
771 ++f;
772 }
773 if (p_longflag != NULL)
774 *p_longflag = longflag;
775 if (p_longlongflag != NULL)
776 *p_longlongflag = longlongflag;
777 if (p_size_tflag != NULL)
778 *p_size_tflag = size_tflag;
779 return f;
780}
781
/* Append the NUL-terminated char string `string` to the output being built.
   The macro relies on two variables of the expanding function: `copy`
   (a const char * scratch cursor) and `s` (the output cursor, which is left
   pointing just past the last character written).
   Wrapped in do/while(0) so that "appendstring(x);" is a single statement
   and can be used safely inside if/else. */
#define appendstring(string) \
    do { \
        for (copy = (string); *copy; ) \
            *s++ = *copy++; \
    } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794PyObject *
795PyUnicode_FromFormatV(const char *format, va_list vargs)
796{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 va_list count;
798 Py_ssize_t callcount = 0;
799 PyObject **callresults = NULL;
800 PyObject **callresult = NULL;
801 Py_ssize_t n = 0;
802 int width = 0;
803 int precision = 0;
804 int zeropad;
805 const char* f;
806 Py_UNICODE *s;
807 PyObject *string;
808 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000809 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* use abuffer instead of buffer, if we need more space
811 * (which can happen if there's a format specifier with width). */
812 char *abuffer = NULL;
813 char *realbuffer;
814 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000816 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000818 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000819 /* step 1: count the number of %S/%R/%A/%s format specifications
820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000824 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +0000825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000828 ++callcount;
829 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000830 else if (128 <= (unsigned char)*f) {
831 PyErr_Format(PyExc_ValueError,
832 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000833 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000834 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000835 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000837 }
838 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000840 if (callcount) {
841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842 if (!callresults) {
843 PyErr_NoMemory();
844 return NULL;
845 }
846 callresult = callresults;
847 }
848 /* step 3: figure out how large a buffer we need */
849 for (f = format; *f; f++) {
850 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000851#ifdef HAVE_LONG_LONG
Victor Stinner96865452011-03-01 23:44:09 +0000852 int longlongflag;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000853#endif
Victor Stinner96865452011-03-01 23:44:09 +0000854 const char* p;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855
Victor Stinner96865452011-03-01 23:44:09 +0000856 p = f;
857 f = parse_format_flags(f, &width, NULL,
858 NULL, &longlongflag, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859
Benjamin Peterson14339b62009-01-31 16:36:08 +0000860 switch (*f) {
861 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000862 {
863#ifndef Py_UNICODE_WIDE
864 int ordinal = va_arg(count, int);
865 if (ordinal > 0xffff)
866 n += 2;
867 else
868 n++;
869#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000870 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000871 n++;
872#endif
873 break;
874 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000875 case '%':
876 n++;
877 break;
878 case 'd': case 'u': case 'i': case 'x':
879 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000880#ifdef HAVE_LONG_LONG
881 if (longlongflag) {
882 if (width < MAX_LONG_LONG_CHARS)
883 width = MAX_LONG_LONG_CHARS;
884 }
885 else
886#endif
887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888 including sign. Decimal takes the most space. This
889 isn't enough for octal. If a width is specified we
890 need more (which we allocate later). */
891 if (width < MAX_LONG_CHARS)
892 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000893 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000894 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000895 if (abuffersize < width)
896 abuffersize = width;
897 break;
898 case 's':
899 {
900 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000901 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903 if (!str)
904 goto fail;
905 n += PyUnicode_GET_SIZE(str);
906 /* Remember the str and switch to the next slot */
907 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000908 break;
909 }
910 case 'U':
911 {
912 PyObject *obj = va_arg(count, PyObject *);
913 assert(obj && PyUnicode_Check(obj));
914 n += PyUnicode_GET_SIZE(obj);
915 break;
916 }
917 case 'V':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000921 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000922 assert(obj || str);
923 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +0000924 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000925 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000926 *callresult++ = NULL;
927 }
928 else {
929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930 if (!str_obj)
931 goto fail;
932 n += PyUnicode_GET_SIZE(str_obj);
933 *callresult++ = str_obj;
934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000935 break;
936 }
937 case 'S':
938 {
939 PyObject *obj = va_arg(count, PyObject *);
940 PyObject *str;
941 assert(obj);
942 str = PyObject_Str(obj);
943 if (!str)
944 goto fail;
945 n += PyUnicode_GET_SIZE(str);
946 /* Remember the str and switch to the next slot */
947 *callresult++ = str;
948 break;
949 }
950 case 'R':
951 {
952 PyObject *obj = va_arg(count, PyObject *);
953 PyObject *repr;
954 assert(obj);
955 repr = PyObject_Repr(obj);
956 if (!repr)
957 goto fail;
958 n += PyUnicode_GET_SIZE(repr);
959 /* Remember the repr and switch to the next slot */
960 *callresult++ = repr;
961 break;
962 }
963 case 'A':
964 {
965 PyObject *obj = va_arg(count, PyObject *);
966 PyObject *ascii;
967 assert(obj);
968 ascii = PyObject_ASCII(obj);
969 if (!ascii)
970 goto fail;
971 n += PyUnicode_GET_SIZE(ascii);
972 /* Remember the repr and switch to the next slot */
973 *callresult++ = ascii;
974 break;
975 }
976 case 'p':
977 (void) va_arg(count, int);
978 /* maximum 64-bit pointer representation:
979 * 0xffffffffffffffff
980 * so 19 characters is enough.
981 * XXX I count 18 -- what's the extra for?
982 */
983 n += 19;
984 break;
985 default:
986 /* if we stumble upon an unknown
987 formatting code, copy the rest of
988 the format string to the output
989 string. (we cannot just skip the
990 code, since there's no way to know
991 what's in the argument list) */
992 n += strlen(p);
993 goto expand;
994 }
995 } else
996 n++;
997 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000998 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000999 if (abuffersize > ITEM_BUFFER_LEN) {
1000 /* add 1 for sprintf's trailing null byte */
1001 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 if (!abuffer) {
1003 PyErr_NoMemory();
1004 goto fail;
1005 }
1006 realbuffer = abuffer;
1007 }
1008 else
1009 realbuffer = buffer;
1010 /* step 4: fill the buffer */
1011 /* Since we've analyzed how much space we need for the worst case,
1012 we don't have to resize the string.
1013 There can be no errors beyond this point. */
1014 string = PyUnicode_FromUnicode(NULL, n);
1015 if (!string)
1016 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001017
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 s = PyUnicode_AS_UNICODE(string);
1019 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 for (f = format; *f; f++) {
1022 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001023 const char* p;
1024 int longflag;
1025 int longlongflag;
1026 int size_tflag;
1027
1028 p = f;
1029 zeropad = (f[1] == '0');
1030 f = parse_format_flags(f, &width, &precision,
1031 &longflag, &longlongflag, &size_tflag);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001032
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 switch (*f) {
1034 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001035 {
1036 int ordinal = va_arg(vargs, int);
1037#ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 *s++ = 0xD800 | (ordinal >> 10);
1041 *s++ = 0xDC00 | (ordinal & 0x3FF);
1042 } else
1043#endif
1044 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001046 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001047 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001048 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
Victor Stinner6d970f42011-03-02 00:04:25 +00001050 width, precision, *f);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 if (longflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001053#ifdef HAVE_LONG_LONG
1054 else if (longlongflag)
1055 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1056#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001057 else if (size_tflag)
1058 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1059 else
1060 sprintf(realbuffer, fmt, va_arg(vargs, int));
1061 appendstring(realbuffer);
1062 break;
1063 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001064 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1065 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001066 if (longflag)
1067 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001068#ifdef HAVE_LONG_LONG
1069 else if (longlongflag)
1070 sprintf(realbuffer, fmt, va_arg(vargs,
1071 unsigned PY_LONG_LONG));
1072#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001073 else if (size_tflag)
1074 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1075 else
1076 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1077 appendstring(realbuffer);
1078 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001080 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001081 sprintf(realbuffer, fmt, va_arg(vargs, int));
1082 appendstring(realbuffer);
1083 break;
1084 case 's':
1085 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001086 /* unused, since we already have the result */
1087 (void) va_arg(vargs, char *);
1088 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1089 PyUnicode_GET_SIZE(*callresult));
1090 s += PyUnicode_GET_SIZE(*callresult);
1091 /* We're done with the unicode()/repr() => forget it */
1092 Py_DECREF(*callresult);
1093 /* switch to next unicode()/repr() result */
1094 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001095 break;
1096 }
1097 case 'U':
1098 {
1099 PyObject *obj = va_arg(vargs, PyObject *);
1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102 s += size;
1103 break;
1104 }
1105 case 'V':
1106 {
1107 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001108 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001109 if (obj) {
1110 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1111 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1112 s += size;
1113 } else {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1115 PyUnicode_GET_SIZE(*callresult));
1116 s += PyUnicode_GET_SIZE(*callresult);
1117 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001119 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001120 break;
1121 }
1122 case 'S':
1123 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001124 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 {
1126 Py_UNICODE *ucopy;
1127 Py_ssize_t usize;
1128 Py_ssize_t upos;
1129 /* unused, since we already have the result */
1130 (void) va_arg(vargs, PyObject *);
1131 ucopy = PyUnicode_AS_UNICODE(*callresult);
1132 usize = PyUnicode_GET_SIZE(*callresult);
1133 for (upos = 0; upos<usize;)
1134 *s++ = ucopy[upos++];
1135 /* We're done with the unicode()/repr() => forget it */
1136 Py_DECREF(*callresult);
1137 /* switch to next unicode()/repr() result */
1138 ++callresult;
1139 break;
1140 }
1141 case 'p':
1142 sprintf(buffer, "%p", va_arg(vargs, void*));
1143 /* %p is ill-defined: ensure leading 0x. */
1144 if (buffer[1] == 'X')
1145 buffer[1] = 'x';
1146 else if (buffer[1] != 'x') {
1147 memmove(buffer+2, buffer, strlen(buffer)+1);
1148 buffer[0] = '0';
1149 buffer[1] = 'x';
1150 }
1151 appendstring(buffer);
1152 break;
1153 case '%':
1154 *s++ = '%';
1155 break;
1156 default:
1157 appendstring(p);
1158 goto end;
1159 }
Victor Stinner1205f272010-09-11 00:54:47 +00001160 }
Victor Stinner1205f272010-09-11 00:54:47 +00001161 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 *s++ = *f;
1163 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
Benjamin Peterson29060642009-01-31 22:14:21 +00001165 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 if (callresults)
1167 PyObject_Free(callresults);
1168 if (abuffer)
1169 PyObject_Free(abuffer);
1170 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1171 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001173 if (callresults) {
1174 PyObject **callresult2 = callresults;
1175 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001176 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001177 ++callresult2;
1178 }
1179 PyObject_Free(callresults);
1180 }
1181 if (abuffer)
1182 PyObject_Free(abuffer);
1183 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001184}
1185
1186#undef appendstring
1187
1188PyObject *
1189PyUnicode_FromFormat(const char *format, ...)
1190{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 PyObject* ret;
1192 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001193
1194#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001195 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001196#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001197 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001198#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001199 ret = PyUnicode_FromFormatV(format, vargs);
1200 va_end(vargs);
1201 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001202}
1203
Victor Stinner5593d8a2010-10-02 11:11:27 +00001204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1205 convert a Unicode object to a wide character string.
1206
1207 - If w is NULL: return the number of wide characters (including the nul
1208 character) required to convert the unicode object. Ignore size argument.
1209
1210 - Otherwise: return the number of wide characters (excluding the nul
1211 character) written into w. Write at most size wide characters (including
1212 the nul character). */
1213static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001214unicode_aswidechar(PyUnicodeObject *unicode,
1215 wchar_t *w,
1216 Py_ssize_t size)
1217{
1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001219 Py_ssize_t res;
1220 if (w != NULL) {
1221 res = PyUnicode_GET_SIZE(unicode);
1222 if (size > res)
1223 size = res + 1;
1224 else
1225 res = size;
1226 memcpy(w, unicode->str, size * sizeof(wchar_t));
1227 return res;
1228 }
1229 else
1230 return PyUnicode_GET_SIZE(unicode) + 1;
1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1232 register const Py_UNICODE *u;
1233 const Py_UNICODE *uend;
1234 const wchar_t *worig, *wend;
1235 Py_ssize_t nchar;
1236
Victor Stinner137c34c2010-09-29 10:25:54 +00001237 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001238 uend = u + PyUnicode_GET_SIZE(unicode);
1239 if (w != NULL) {
1240 worig = w;
1241 wend = w + size;
1242 while (u != uend && w != wend) {
1243 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1244 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1245 {
1246 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1247 u += 2;
1248 }
1249 else {
1250 *w = *u;
1251 u++;
1252 }
1253 w++;
1254 }
1255 if (w != wend)
1256 *w = L'\0';
1257 return w - worig;
1258 }
1259 else {
1260 nchar = 1; /* nul character at the end */
1261 while (u != uend) {
1262 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1263 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1264 u += 2;
1265 else
1266 u++;
1267 nchar++;
1268 }
1269 }
1270 return nchar;
1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1272 register Py_UNICODE *u, *uend, ordinal;
1273 register Py_ssize_t i;
1274 wchar_t *worig, *wend;
1275 Py_ssize_t nchar;
1276
1277 u = PyUnicode_AS_UNICODE(unicode);
1278 uend = u + PyUnicode_GET_SIZE(u);
1279 if (w != NULL) {
1280 worig = w;
1281 wend = w + size;
1282 while (u != uend && w != wend) {
1283 ordinal = *u;
1284 if (ordinal > 0xffff) {
1285 ordinal -= 0x10000;
1286 *w++ = 0xD800 | (ordinal >> 10);
1287 *w++ = 0xDC00 | (ordinal & 0x3FF);
1288 }
1289 else
1290 *w++ = ordinal;
1291 u++;
1292 }
1293 if (w != wend)
1294 *w = 0;
1295 return w - worig;
1296 }
1297 else {
1298 nchar = 1; /* nul character */
1299 while (u != uend) {
1300 if (*u > 0xffff)
1301 nchar += 2;
1302 else
1303 nchar++;
1304 u++;
1305 }
1306 return nchar;
1307 }
1308#else
1309# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001310#endif
1311}
1312
1313Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001315 wchar_t *w,
1316 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317{
1318 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001319 PyErr_BadInternalCall();
1320 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323}
1324
Victor Stinner137c34c2010-09-29 10:25:54 +00001325wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001326PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001327 Py_ssize_t *size)
1328{
1329 wchar_t* buffer;
1330 Py_ssize_t buflen;
1331
1332 if (unicode == NULL) {
1333 PyErr_BadInternalCall();
1334 return NULL;
1335 }
1336
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001338 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001339 PyErr_NoMemory();
1340 return NULL;
1341 }
1342
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1344 if (buffer == NULL) {
1345 PyErr_NoMemory();
1346 return NULL;
1347 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001348 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001349 if (size != NULL)
1350 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001351 return buffer;
1352}
1353
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354#endif
1355
Alexander Belopolsky40018472011-02-26 01:02:56 +00001356PyObject *
1357PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001358{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001359 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001361 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 PyErr_SetString(PyExc_ValueError,
1363 "chr() arg not in range(0x110000)");
1364 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001365 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001366
1367#ifndef Py_UNICODE_WIDE
1368 if (ordinal > 0xffff) {
1369 ordinal -= 0x10000;
1370 s[0] = 0xD800 | (ordinal >> 10);
1371 s[1] = 0xDC00 | (ordinal & 0x3FF);
1372 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001373 }
1374#endif
1375
Hye-Shik Chang40574832004-04-06 07:24:51 +00001376 s[0] = (Py_UNICODE)ordinal;
1377 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001378}
1379
Alexander Belopolsky40018472011-02-26 01:02:56 +00001380PyObject *
1381PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001383 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001384 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001385 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001386 Py_INCREF(obj);
1387 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001388 }
1389 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 /* For a Unicode subtype that's not a Unicode object,
1391 return a true Unicode object with the same data. */
1392 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1393 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001394 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001395 PyErr_Format(PyExc_TypeError,
1396 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001397 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001398 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001399}
1400
Alexander Belopolsky40018472011-02-26 01:02:56 +00001401PyObject *
1402PyUnicode_FromEncodedObject(register PyObject *obj,
1403 const char *encoding,
1404 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001405{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001406 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001407 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001408
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 PyErr_BadInternalCall();
1411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Decoding bytes objects is the most common case and should be fast */
1415 if (PyBytes_Check(obj)) {
1416 if (PyBytes_GET_SIZE(obj) == 0) {
1417 Py_INCREF(unicode_empty);
1418 v = (PyObject *) unicode_empty;
1419 }
1420 else {
1421 v = PyUnicode_Decode(
1422 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1423 encoding, errors);
1424 }
1425 return v;
1426 }
1427
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001428 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 PyErr_SetString(PyExc_TypeError,
1430 "decoding str is not supported");
1431 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001432 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001433
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001434 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1435 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1436 PyErr_Format(PyExc_TypeError,
1437 "coercing to str: need bytes, bytearray "
1438 "or buffer-like object, %.80s found",
1439 Py_TYPE(obj)->tp_name);
1440 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001441 }
Tim Petersced69f82003-09-16 20:30:58 +00001442
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001443 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001445 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 }
Tim Petersced69f82003-09-16 20:30:58 +00001447 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001448 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001449
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001450 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001451 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The normalized, nul-terminated name is written to
   lower, which has room for lower_len characters.

   Return 0 on error (encoding is longer than lower_len-1, or lower_len is
   0 so that not even the terminating nul fits), 1 on success.  On error
   the contents of lower are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_end;

    /* lower_len - 1 below would wrap around for an empty buffer */
    if (lower_len == 0)
        return 0;

    dst = lower;
    /* last slot is reserved for the terminating nul */
    dst_end = lower + (lower_len - 1);
    for (src = encoding; *src != '\0'; src++) {
        if (dst == dst_end)
            return 0;
        if (Py_ISUPPER(*src))
            *dst++ = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst++ = '-';
        else
            *dst++ = *src;
    }
    *dst = '\0';
    return 1;
}
1486
Alexander Belopolsky40018472011-02-26 01:02:56 +00001487PyObject *
1488PyUnicode_Decode(const char *s,
1489 Py_ssize_t size,
1490 const char *encoding,
1491 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001492{
1493 PyObject *buffer = NULL, *unicode;
1494 Py_buffer info;
1495 char lower[11]; /* Enough for any encoding shortcut */
1496
1497 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001499
1500 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001501 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001502 if ((strcmp(lower, "utf-8") == 0) ||
1503 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001504 return PyUnicode_DecodeUTF8(s, size, errors);
1505 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001506 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001507 (strcmp(lower, "iso-8859-1") == 0))
1508 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001509#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001510 else if (strcmp(lower, "mbcs") == 0)
1511 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001513 else if (strcmp(lower, "ascii") == 0)
1514 return PyUnicode_DecodeASCII(s, size, errors);
1515 else if (strcmp(lower, "utf-16") == 0)
1516 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1517 else if (strcmp(lower, "utf-32") == 0)
1518 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520
1521 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001522 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001523 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001524 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001525 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 if (buffer == NULL)
1527 goto onError;
1528 unicode = PyCodec_Decode(buffer, encoding, errors);
1529 if (unicode == NULL)
1530 goto onError;
1531 if (!PyUnicode_Check(unicode)) {
1532 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001533 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001534 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535 Py_DECREF(unicode);
1536 goto onError;
1537 }
1538 Py_DECREF(buffer);
1539 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001540
Benjamin Peterson29060642009-01-31 22:14:21 +00001541 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 Py_XDECREF(buffer);
1543 return NULL;
1544}
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546PyObject *
1547PyUnicode_AsDecodedObject(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 return v;
1566
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001568 return NULL;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571PyObject *
1572PyUnicode_AsDecodedUnicode(PyObject *unicode,
1573 const char *encoding,
1574 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575{
1576 PyObject *v;
1577
1578 if (!PyUnicode_Check(unicode)) {
1579 PyErr_BadArgument();
1580 goto onError;
1581 }
1582
1583 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001585
1586 /* Decode via the codec registry */
1587 v = PyCodec_Decode(unicode, encoding, errors);
1588 if (v == NULL)
1589 goto onError;
1590 if (!PyUnicode_Check(v)) {
1591 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001592 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001593 Py_TYPE(v)->tp_name);
1594 Py_DECREF(v);
1595 goto onError;
1596 }
1597 return v;
1598
Benjamin Peterson29060642009-01-31 22:14:21 +00001599 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001600 return NULL;
1601}
1602
Alexander Belopolsky40018472011-02-26 01:02:56 +00001603PyObject *
1604PyUnicode_Encode(const Py_UNICODE *s,
1605 Py_ssize_t size,
1606 const char *encoding,
1607 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608{
1609 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001610
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 unicode = PyUnicode_FromUnicode(s, size);
1612 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1615 Py_DECREF(unicode);
1616 return v;
1617}
1618
Alexander Belopolsky40018472011-02-26 01:02:56 +00001619PyObject *
1620PyUnicode_AsEncodedObject(PyObject *unicode,
1621 const char *encoding,
1622 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001623{
1624 PyObject *v;
1625
1626 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument();
1628 goto onError;
1629 }
1630
1631 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001633
1634 /* Encode via the codec registry */
1635 v = PyCodec_Encode(unicode, encoding, errors);
1636 if (v == NULL)
1637 goto onError;
1638 return v;
1639
Benjamin Peterson29060642009-01-31 22:14:21 +00001640 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001641 return NULL;
1642}
1643
Victor Stinnerad158722010-10-27 00:25:46 +00001644PyObject *
1645PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001646{
Victor Stinner313a1202010-06-11 23:56:51 +00001647#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode),
1650 NULL);
1651#elif defined(__APPLE__)
1652 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 "surrogateescape");
1655#else
1656 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001657 return PyUnicode_AsEncodedString(unicode,
1658 Py_FileSystemDefaultEncoding,
1659 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001660 }
1661 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001662 /* locale encoding with surrogateescape */
1663 wchar_t *wchar;
1664 char *bytes;
1665 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001666 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001667
1668 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1669 if (wchar == NULL)
1670 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001671 bytes = _Py_wchar2char(wchar, &error_pos);
1672 if (bytes == NULL) {
1673 if (error_pos != (size_t)-1) {
1674 char *errmsg = strerror(errno);
1675 PyObject *exc = NULL;
1676 if (errmsg == NULL)
1677 errmsg = "Py_wchar2char() failed";
1678 raise_encode_exception(&exc,
1679 "filesystemencoding",
1680 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1681 error_pos, error_pos+1,
1682 errmsg);
1683 Py_XDECREF(exc);
1684 }
1685 else
1686 PyErr_NoMemory();
1687 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001688 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001689 }
1690 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001691
1692 bytes_obj = PyBytes_FromString(bytes);
1693 PyMem_Free(bytes);
1694 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001695 }
Victor Stinnerad158722010-10-27 00:25:46 +00001696#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001697}
1698
Alexander Belopolsky40018472011-02-26 01:02:56 +00001699PyObject *
1700PyUnicode_AsEncodedString(PyObject *unicode,
1701 const char *encoding,
1702 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703{
1704 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001705 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001706
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 if (!PyUnicode_Check(unicode)) {
1708 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 }
Fred Drakee4315f52000-05-09 19:53:39 +00001711
Victor Stinner2f283c22011-03-02 01:21:46 +00001712 if (encoding == NULL) {
1713 if (errors == NULL || strcmp(errors, "strict") == 0)
1714 return PyUnicode_AsUTF8String(unicode);
1715 else
1716 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1717 PyUnicode_GET_SIZE(unicode),
1718 errors);
1719 }
Fred Drakee4315f52000-05-09 19:53:39 +00001720
1721 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001722 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001723 if ((strcmp(lower, "utf-8") == 0) ||
1724 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00001725 {
Victor Stinner2f283c22011-03-02 01:21:46 +00001726 if (errors == NULL || strcmp(errors, "strict") == 0)
Victor Stinnera5c68c32011-03-02 01:03:14 +00001727 return PyUnicode_AsUTF8String(unicode);
Victor Stinner2f283c22011-03-02 01:21:46 +00001728 else
Victor Stinnera5c68c32011-03-02 01:03:14 +00001729 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1730 PyUnicode_GET_SIZE(unicode),
1731 errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00001732 }
Victor Stinner37296e82010-06-10 13:36:23 +00001733 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001734 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001735 (strcmp(lower, "iso-8859-1") == 0))
1736 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1737 PyUnicode_GET_SIZE(unicode),
1738 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001739#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001740 else if (strcmp(lower, "mbcs") == 0)
1741 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1742 PyUnicode_GET_SIZE(unicode),
1743 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001744#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001745 else if (strcmp(lower, "ascii") == 0)
1746 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1747 PyUnicode_GET_SIZE(unicode),
1748 errors);
1749 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750
1751 /* Encode via the codec registry */
1752 v = PyCodec_Encode(unicode, encoding, errors);
1753 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001754 return NULL;
1755
1756 /* The normal path */
1757 if (PyBytes_Check(v))
1758 return v;
1759
1760 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001761 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001762 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001763 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001764
1765 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1766 "encoder %s returned bytearray instead of bytes",
1767 encoding);
1768 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001769 Py_DECREF(v);
1770 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001771 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001772
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001773 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1774 Py_DECREF(v);
1775 return b;
1776 }
1777
1778 PyErr_Format(PyExc_TypeError,
1779 "encoder did not return a bytes object (type=%.400s)",
1780 Py_TYPE(v)->tp_name);
1781 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001782 return NULL;
1783}
1784
Alexander Belopolsky40018472011-02-26 01:02:56 +00001785PyObject *
1786PyUnicode_AsEncodedUnicode(PyObject *unicode,
1787 const char *encoding,
1788 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001789{
1790 PyObject *v;
1791
1792 if (!PyUnicode_Check(unicode)) {
1793 PyErr_BadArgument();
1794 goto onError;
1795 }
1796
1797 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001798 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001799
1800 /* Encode via the codec registry */
1801 v = PyCodec_Encode(unicode, encoding, errors);
1802 if (v == NULL)
1803 goto onError;
1804 if (!PyUnicode_Check(v)) {
1805 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001806 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001807 Py_TYPE(v)->tp_name);
1808 Py_DECREF(v);
1809 goto onError;
1810 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001811 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001812
Benjamin Peterson29060642009-01-31 22:14:21 +00001813 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 return NULL;
1815}
1816
Alexander Belopolsky40018472011-02-26 01:02:56 +00001817PyObject *
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001818_PyUnicode_AsDefaultEncodedString(PyObject *unicode)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001819{
1820 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001821 if (v)
1822 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001823 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001824 PyUnicode_GET_SIZE(unicode),
1825 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001826 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001827 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001828 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001829 return v;
1830}
1831
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001832PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001833PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001834 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001835 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1836}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001837
Christian Heimes5894ba72007-11-04 11:43:14 +00001838PyObject*
1839PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1840{
Victor Stinnerad158722010-10-27 00:25:46 +00001841#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1842 return PyUnicode_DecodeMBCS(s, size, NULL);
1843#elif defined(__APPLE__)
1844 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1845#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001846 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1847 can be undefined. If it is case, decode using UTF-8. The following assumes
1848 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1849 bootstrapping process where the codecs aren't ready yet.
1850 */
1851 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001852 return PyUnicode_Decode(s, size,
1853 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001854 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001855 }
1856 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001857 /* locale encoding with surrogateescape */
1858 wchar_t *wchar;
1859 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001860 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001861
1862 if (s[size] != '\0' || size != strlen(s)) {
1863 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1864 return NULL;
1865 }
1866
Victor Stinner168e1172010-10-16 23:16:16 +00001867 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001868 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001869 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001870
Victor Stinner168e1172010-10-16 23:16:16 +00001871 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001872 PyMem_Free(wchar);
1873 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001874 }
Victor Stinnerad158722010-10-27 00:25:46 +00001875#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001876}
1877
Martin v. Löwis011e8422009-05-05 04:43:17 +00001878
1879int
1880PyUnicode_FSConverter(PyObject* arg, void* addr)
1881{
1882 PyObject *output = NULL;
1883 Py_ssize_t size;
1884 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001885 if (arg == NULL) {
1886 Py_DECREF(*(PyObject**)addr);
1887 return 1;
1888 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001889 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001890 output = arg;
1891 Py_INCREF(output);
1892 }
1893 else {
1894 arg = PyUnicode_FromObject(arg);
1895 if (!arg)
1896 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001897 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001898 Py_DECREF(arg);
1899 if (!output)
1900 return 0;
1901 if (!PyBytes_Check(output)) {
1902 Py_DECREF(output);
1903 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1904 return 0;
1905 }
1906 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001907 size = PyBytes_GET_SIZE(output);
1908 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001909 if (size != strlen(data)) {
1910 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1911 Py_DECREF(output);
1912 return 0;
1913 }
1914 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001915 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001916}
1917
1918
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001919int
1920PyUnicode_FSDecoder(PyObject* arg, void* addr)
1921{
1922 PyObject *output = NULL;
1923 Py_ssize_t size;
1924 void *data;
1925 if (arg == NULL) {
1926 Py_DECREF(*(PyObject**)addr);
1927 return 1;
1928 }
1929 if (PyUnicode_Check(arg)) {
1930 output = arg;
1931 Py_INCREF(output);
1932 }
1933 else {
1934 arg = PyBytes_FromObject(arg);
1935 if (!arg)
1936 return 0;
1937 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1938 PyBytes_GET_SIZE(arg));
1939 Py_DECREF(arg);
1940 if (!output)
1941 return 0;
1942 if (!PyUnicode_Check(output)) {
1943 Py_DECREF(output);
1944 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1945 return 0;
1946 }
1947 }
1948 size = PyUnicode_GET_SIZE(output);
1949 data = PyUnicode_AS_UNICODE(output);
1950 if (size != Py_UNICODE_strlen(data)) {
1951 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1952 Py_DECREF(output);
1953 return 0;
1954 }
1955 *(PyObject**)addr = output;
1956 return Py_CLEANUP_SUPPORTED;
1957}
1958
1959
Martin v. Löwis5b222132007-06-10 09:51:05 +00001960char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001961_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001962{
Christian Heimesf3863112007-11-22 07:46:41 +00001963 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001964 if (!PyUnicode_Check(unicode)) {
1965 PyErr_BadArgument();
1966 return NULL;
1967 }
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001968 bytes = _PyUnicode_AsDefaultEncodedString(unicode);
Christian Heimesf3863112007-11-22 07:46:41 +00001969 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001970 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001971 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001972 *psize = PyBytes_GET_SIZE(bytes);
1973 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001974}
1975
1976char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001977_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001978{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001979 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001980}
1981
Alexander Belopolsky40018472011-02-26 01:02:56 +00001982Py_UNICODE *
1983PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984{
1985 if (!PyUnicode_Check(unicode)) {
1986 PyErr_BadArgument();
1987 goto onError;
1988 }
1989 return PyUnicode_AS_UNICODE(unicode);
1990
Benjamin Peterson29060642009-01-31 22:14:21 +00001991 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 return NULL;
1993}
1994
Alexander Belopolsky40018472011-02-26 01:02:56 +00001995Py_ssize_t
1996PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001997{
1998 if (!PyUnicode_Check(unicode)) {
1999 PyErr_BadArgument();
2000 goto onError;
2001 }
2002 return PyUnicode_GET_SIZE(unicode);
2003
Benjamin Peterson29060642009-01-31 22:14:21 +00002004 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002005 return -1;
2006}
2007
/* Return the name of the default encoding.  It is fixed: the function
   always returns the string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2013
Victor Stinner554f3f02010-06-16 23:33:54 +00002014/* create or adjust a UnicodeDecodeError */
2015static void
2016make_decode_exception(PyObject **exceptionObject,
2017 const char *encoding,
2018 const char *input, Py_ssize_t length,
2019 Py_ssize_t startpos, Py_ssize_t endpos,
2020 const char *reason)
2021{
2022 if (*exceptionObject == NULL) {
2023 *exceptionObject = PyUnicodeDecodeError_Create(
2024 encoding, input, length, startpos, endpos, reason);
2025 }
2026 else {
2027 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2028 goto onError;
2029 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2030 goto onError;
2031 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2032 goto onError;
2033 }
2034 return;
2035
2036onError:
2037 Py_DECREF(*exceptionObject);
2038 *exceptionObject = NULL;
2039}
2040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002041/* error handling callback helper:
2042 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002043 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002044 and adjust various state variables.
2045 return 0 on success, -1 on error
2046*/
2047
Alexander Belopolsky40018472011-02-26 01:02:56 +00002048static int
2049unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2050 const char *encoding, const char *reason,
2051 const char **input, const char **inend, Py_ssize_t *startinpos,
2052 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2053 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002055 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056
2057 PyObject *restuple = NULL;
2058 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002059 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002060 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002061 Py_ssize_t requiredsize;
2062 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002063 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002064 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002065 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002066 int res = -1;
2067
2068 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002069 *errorHandler = PyCodec_LookupError(errors);
2070 if (*errorHandler == NULL)
2071 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002072 }
2073
Victor Stinner554f3f02010-06-16 23:33:54 +00002074 make_decode_exception(exceptionObject,
2075 encoding,
2076 *input, *inend - *input,
2077 *startinpos, *endinpos,
2078 reason);
2079 if (*exceptionObject == NULL)
2080 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081
2082 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2083 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002084 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002085 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002086 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002087 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002088 }
2089 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002090 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002091
2092 /* Copy back the bytes variables, which might have been modified by the
2093 callback */
2094 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2095 if (!inputobj)
2096 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002097 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002098 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002099 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002100 *input = PyBytes_AS_STRING(inputobj);
2101 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002102 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002103 /* we can DECREF safely, as the exception has another reference,
2104 so the object won't go away. */
2105 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002108 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002109 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002110 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2111 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002112 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113
2114 /* need more space? (at least enough for what we
2115 have+the replacement+the rest of the string (starting
2116 at the new input position), so we won't have to check space
2117 when there are no errors in the rest of the string) */
2118 repptr = PyUnicode_AS_UNICODE(repunicode);
2119 repsize = PyUnicode_GET_SIZE(repunicode);
2120 requiredsize = *outpos + repsize + insize-newpos;
2121 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002122 if (requiredsize<2*outsize)
2123 requiredsize = 2*outsize;
2124 if (_PyUnicode_Resize(output, requiredsize) < 0)
2125 goto onError;
2126 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 }
2128 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002129 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 Py_UNICODE_COPY(*outptr, repptr, repsize);
2131 *outptr += repsize;
2132 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002133
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002134 /* we made it! */
2135 res = 0;
2136
Benjamin Peterson29060642009-01-31 22:14:21 +00002137 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002138 Py_XDECREF(restuple);
2139 return res;
2140}
2141
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized so that an argument which is
 * itself an expression (e.g. `a || b`) is evaluated as a unit before
 * being combined with the category test. */

#define ENCODE_DIRECT(c, directO, directWS)                     \
    ((c) < 128 && (c) > 0 &&                                    \
     ((utf7_category[(c)] == 0) ||                              \
      ((directWS) && (utf7_category[(c)] == 2)) ||              \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002222
Alexander Belopolsky40018472011-02-26 01:02:56 +00002223PyObject *
2224PyUnicode_DecodeUTF7(const char *s,
2225 Py_ssize_t size,
2226 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002227{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002228 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2229}
2230
Antoine Pitrou244651a2009-05-04 18:56:13 +00002231/* The decoder. The only state we preserve is our read position,
2232 * i.e. how many characters we have consumed. So if we end in the
2233 * middle of a shift sequence we have to back off the read position
2234 * and the output to the beginning of the sequence, otherwise we lose
2235 * all the shift state (seen bits, number of bits seen, high
2236 * surrogate). */
2237
Alexander Belopolsky40018472011-02-26 01:02:56 +00002238PyObject *
2239PyUnicode_DecodeUTF7Stateful(const char *s,
2240 Py_ssize_t size,
2241 const char *errors,
2242 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002243{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002244 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002245 Py_ssize_t startinpos;
2246 Py_ssize_t endinpos;
2247 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002248 const char *e;
2249 PyUnicodeObject *unicode;
2250 Py_UNICODE *p;
2251 const char *errmsg = "";
2252 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002253 Py_UNICODE *shiftOutStart;
2254 unsigned int base64bits = 0;
2255 unsigned long base64buffer = 0;
2256 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 PyObject *errorHandler = NULL;
2258 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002259
2260 unicode = _PyUnicode_New(size);
2261 if (!unicode)
2262 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002263 if (size == 0) {
2264 if (consumed)
2265 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002266 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002267 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002268
2269 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002270 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002271 e = s + size;
2272
2273 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002274 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002275 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002276 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002277
Antoine Pitrou244651a2009-05-04 18:56:13 +00002278 if (inShift) { /* in a base-64 section */
2279 if (IS_BASE64(ch)) { /* consume a base-64 character */
2280 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2281 base64bits += 6;
2282 s++;
2283 if (base64bits >= 16) {
2284 /* we have enough bits for a UTF-16 value */
2285 Py_UNICODE outCh = (Py_UNICODE)
2286 (base64buffer >> (base64bits-16));
2287 base64bits -= 16;
2288 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2289 if (surrogate) {
2290 /* expecting a second surrogate */
2291 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2292#ifdef Py_UNICODE_WIDE
2293 *p++ = (((surrogate & 0x3FF)<<10)
2294 | (outCh & 0x3FF)) + 0x10000;
2295#else
2296 *p++ = surrogate;
2297 *p++ = outCh;
2298#endif
2299 surrogate = 0;
2300 }
2301 else {
2302 surrogate = 0;
2303 errmsg = "second surrogate missing";
2304 goto utf7Error;
2305 }
2306 }
2307 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2308 /* first surrogate */
2309 surrogate = outCh;
2310 }
2311 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2312 errmsg = "unexpected second surrogate";
2313 goto utf7Error;
2314 }
2315 else {
2316 *p++ = outCh;
2317 }
2318 }
2319 }
2320 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002321 inShift = 0;
2322 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002323 if (surrogate) {
2324 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002325 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002326 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002327 if (base64bits > 0) { /* left-over bits */
2328 if (base64bits >= 6) {
2329 /* We've seen at least one base-64 character */
2330 errmsg = "partial character in shift sequence";
2331 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002332 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002333 else {
2334 /* Some bits remain; they should be zero */
2335 if (base64buffer != 0) {
2336 errmsg = "non-zero padding bits in shift sequence";
2337 goto utf7Error;
2338 }
2339 }
2340 }
2341 if (ch != '-') {
2342 /* '-' is absorbed; other terminating
2343 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002344 *p++ = ch;
2345 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002346 }
2347 }
2348 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002349 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002350 s++; /* consume '+' */
2351 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002352 s++;
2353 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002354 }
2355 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002356 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002357 shiftOutStart = p;
2358 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002359 }
2360 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002361 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002362 *p++ = ch;
2363 s++;
2364 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002365 else {
2366 startinpos = s-starts;
2367 s++;
2368 errmsg = "unexpected special character";
2369 goto utf7Error;
2370 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002371 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002372utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002373 outpos = p-PyUnicode_AS_UNICODE(unicode);
2374 endinpos = s-starts;
2375 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002376 errors, &errorHandler,
2377 "utf7", errmsg,
2378 &starts, &e, &startinpos, &endinpos, &exc, &s,
2379 &unicode, &outpos, &p))
2380 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002381 }
2382
Antoine Pitrou244651a2009-05-04 18:56:13 +00002383 /* end of string */
2384
2385 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2386 /* if we're in an inconsistent state, that's an error */
2387 if (surrogate ||
2388 (base64bits >= 6) ||
2389 (base64bits > 0 && base64buffer != 0)) {
2390 outpos = p-PyUnicode_AS_UNICODE(unicode);
2391 endinpos = size;
2392 if (unicode_decode_call_errorhandler(
2393 errors, &errorHandler,
2394 "utf7", "unterminated shift sequence",
2395 &starts, &e, &startinpos, &endinpos, &exc, &s,
2396 &unicode, &outpos, &p))
2397 goto onError;
2398 if (s < e)
2399 goto restart;
2400 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002401 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002402
2403 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002404 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002405 if (inShift) {
2406 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002407 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002408 }
2409 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002410 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002411 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002412 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002413
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002414 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002415 goto onError;
2416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002417 Py_XDECREF(errorHandler);
2418 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002419 return (PyObject *)unicode;
2420
Benjamin Peterson29060642009-01-31 22:14:21 +00002421 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002422 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002424 Py_DECREF(unicode);
2425 return NULL;
2426}
2427
2428
Alexander Belopolsky40018472011-02-26 01:02:56 +00002429PyObject *
2430PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2431 Py_ssize_t size,
2432 int base64SetO,
2433 int base64WhiteSpace,
2434 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002435{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002436 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002437 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002438 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002439 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002440 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002441 unsigned int base64bits = 0;
2442 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002443 char * out;
2444 char * start;
2445
2446 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002447 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002448
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002449 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002450 return PyErr_NoMemory();
2451
Antoine Pitrou244651a2009-05-04 18:56:13 +00002452 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002453 if (v == NULL)
2454 return NULL;
2455
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002456 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002457 for (;i < size; ++i) {
2458 Py_UNICODE ch = s[i];
2459
Antoine Pitrou244651a2009-05-04 18:56:13 +00002460 if (inShift) {
2461 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2462 /* shifting out */
2463 if (base64bits) { /* output remaining bits */
2464 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2465 base64buffer = 0;
2466 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002467 }
2468 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002469 /* Characters not in the BASE64 set implicitly unshift the sequence
2470 so no '-' is required, except if the character is itself a '-' */
2471 if (IS_BASE64(ch) || ch == '-') {
2472 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002473 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002474 *out++ = (char) ch;
2475 }
2476 else {
2477 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002478 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002479 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002480 else { /* not in a shift sequence */
2481 if (ch == '+') {
2482 *out++ = '+';
2483 *out++ = '-';
2484 }
2485 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2486 *out++ = (char) ch;
2487 }
2488 else {
2489 *out++ = '+';
2490 inShift = 1;
2491 goto encode_char;
2492 }
2493 }
2494 continue;
2495encode_char:
2496#ifdef Py_UNICODE_WIDE
2497 if (ch >= 0x10000) {
2498 /* code first surrogate */
2499 base64bits += 16;
2500 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2501 while (base64bits >= 6) {
2502 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2503 base64bits -= 6;
2504 }
2505 /* prepare second surrogate */
2506 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2507 }
2508#endif
2509 base64bits += 16;
2510 base64buffer = (base64buffer << 16) | ch;
2511 while (base64bits >= 6) {
2512 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2513 base64bits -= 6;
2514 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002515 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002516 if (base64bits)
2517 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2518 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002519 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002520 if (_PyBytes_Resize(&v, out - start) < 0)
2521 return NULL;
2522 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002523}
2524
Antoine Pitrou244651a2009-05-04 18:56:13 +00002525#undef IS_BASE64
2526#undef FROM_BASE64
2527#undef TO_BASE64
2528#undef DECODE_DIRECT
2529#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002530
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531/* --- UTF-8 Codec -------------------------------------------------------- */
2532
/* Total length in bytes of a UTF-8 sequence, indexed by its first byte.
   An entry of zero marks a byte that is not a legal prefix.
   See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,   /* 70-7F */
    /* 80-BF: never a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,   /* B0-BF */
    /* C0-C1 illegal, C2-DF: two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,   /* D0-DF */
    /* E0-EF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,   /* E0-EF */
    /* F0-F4: four-byte sequences, F5-FF illegal */
    4, 4, 4, 4, 4, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0    /* F0-FF */
};
2554
Alexander Belopolsky40018472011-02-26 01:02:56 +00002555PyObject *
2556PyUnicode_DecodeUTF8(const char *s,
2557 Py_ssize_t size,
2558 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559{
Walter Dörwald69652032004-09-07 20:24:22 +00002560 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2561}
2562
Antoine Pitrouab868312009-01-10 15:40:25 +00002563/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2564#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2565
2566/* Mask to quickly check whether a C 'long' contains a
2567 non-ASCII, UTF8-encoded char. */
2568#if (SIZEOF_LONG == 8)
2569# define ASCII_CHAR_MASK 0x8080808080808080L
2570#elif (SIZEOF_LONG == 4)
2571# define ASCII_CHAR_MASK 0x80808080L
2572#else
2573# error C 'long' size should be either 4 or 8!
2574#endif
2575
Alexander Belopolsky40018472011-02-26 01:02:56 +00002576PyObject *
2577PyUnicode_DecodeUTF8Stateful(const char *s,
2578 Py_ssize_t size,
2579 const char *errors,
2580 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002581{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002582 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002584 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002585 Py_ssize_t startinpos;
2586 Py_ssize_t endinpos;
2587 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002588 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 PyUnicodeObject *unicode;
2590 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002591 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002592 PyObject *errorHandler = NULL;
2593 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594
2595 /* Note: size will always be longer than the resulting Unicode
2596 character count */
2597 unicode = _PyUnicode_New(size);
2598 if (!unicode)
2599 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002600 if (size == 0) {
2601 if (consumed)
2602 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605
2606 /* Unpack UTF-8 encoded data */
2607 p = unicode->str;
2608 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002609 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002612 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002613
2614 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002615 /* Fast path for runs of ASCII characters. Given that common UTF-8
2616 input will consist of an overwhelming majority of ASCII
2617 characters, we try to optimize for this case by checking
2618 as many characters as a C 'long' can contain.
2619 First, check if we can do an aligned read, as most CPUs have
2620 a penalty for unaligned reads.
2621 */
2622 if (!((size_t) s & LONG_PTR_MASK)) {
2623 /* Help register allocation */
2624 register const char *_s = s;
2625 register Py_UNICODE *_p = p;
2626 while (_s < aligned_end) {
2627 /* Read a whole long at a time (either 4 or 8 bytes),
2628 and do a fast unrolled copy if it only contains ASCII
2629 characters. */
2630 unsigned long data = *(unsigned long *) _s;
2631 if (data & ASCII_CHAR_MASK)
2632 break;
2633 _p[0] = (unsigned char) _s[0];
2634 _p[1] = (unsigned char) _s[1];
2635 _p[2] = (unsigned char) _s[2];
2636 _p[3] = (unsigned char) _s[3];
2637#if (SIZEOF_LONG == 8)
2638 _p[4] = (unsigned char) _s[4];
2639 _p[5] = (unsigned char) _s[5];
2640 _p[6] = (unsigned char) _s[6];
2641 _p[7] = (unsigned char) _s[7];
2642#endif
2643 _s += SIZEOF_LONG;
2644 _p += SIZEOF_LONG;
2645 }
2646 s = _s;
2647 p = _p;
2648 if (s == e)
2649 break;
2650 ch = (unsigned char)*s;
2651 }
2652 }
2653
2654 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002655 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656 s++;
2657 continue;
2658 }
2659
2660 n = utf8_code_length[ch];
2661
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002662 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002663 if (consumed)
2664 break;
2665 else {
2666 errmsg = "unexpected end of data";
2667 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002668 endinpos = startinpos+1;
2669 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2670 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002671 goto utf8Error;
2672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674
2675 switch (n) {
2676
2677 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002678 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002679 startinpos = s-starts;
2680 endinpos = startinpos+1;
2681 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002682
2683 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002684 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 startinpos = s-starts;
2686 endinpos = startinpos+1;
2687 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002688
2689 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002690 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002691 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002693 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002694 goto utf8Error;
2695 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002697 assert ((ch > 0x007F) && (ch <= 0x07FF));
2698 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 break;
2700
2701 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002702 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2703 will result in surrogates in range d800-dfff. Surrogates are
2704 not valid UTF-8 so they are rejected.
2705 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2706 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002707 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002708 (s[2] & 0xc0) != 0x80 ||
2709 ((unsigned char)s[0] == 0xE0 &&
2710 (unsigned char)s[1] < 0xA0) ||
2711 ((unsigned char)s[0] == 0xED &&
2712 (unsigned char)s[1] > 0x9F)) {
2713 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002714 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002715 endinpos = startinpos + 1;
2716
2717 /* if s[1] first two bits are 1 and 0, then the invalid
2718 continuation byte is s[2], so increment endinpos by 1,
2719 if not, s[1] is invalid and endinpos doesn't need to
2720 be incremented. */
2721 if ((s[1] & 0xC0) == 0x80)
2722 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002723 goto utf8Error;
2724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002726 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2727 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002728 break;
2729
2730 case 4:
2731 if ((s[1] & 0xc0) != 0x80 ||
2732 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002733 (s[3] & 0xc0) != 0x80 ||
2734 ((unsigned char)s[0] == 0xF0 &&
2735 (unsigned char)s[1] < 0x90) ||
2736 ((unsigned char)s[0] == 0xF4 &&
2737 (unsigned char)s[1] > 0x8F)) {
2738 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002739 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002740 endinpos = startinpos + 1;
2741 if ((s[1] & 0xC0) == 0x80) {
2742 endinpos++;
2743 if ((s[2] & 0xC0) == 0x80)
2744 endinpos++;
2745 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002746 goto utf8Error;
2747 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002748 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002749 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2750 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2751
Fredrik Lundh8f455852001-06-27 18:59:43 +00002752#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002753 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002754#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002755 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002756
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002757 /* translate from 10000..10FFFF to 0..FFFF */
2758 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002759
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002760 /* high surrogate = top 10 bits added to D800 */
2761 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002762
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002763 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002764 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002765#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 }
2768 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002769 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002770
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 utf8Error:
2772 outpos = p-PyUnicode_AS_UNICODE(unicode);
2773 if (unicode_decode_call_errorhandler(
2774 errors, &errorHandler,
2775 "utf8", errmsg,
2776 &starts, &e, &startinpos, &endinpos, &exc, &s,
2777 &unicode, &outpos, &p))
2778 goto onError;
2779 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 }
Walter Dörwald69652032004-09-07 20:24:22 +00002781 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002782 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783
2784 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002785 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 goto onError;
2787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788 Py_XDECREF(errorHandler);
2789 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 return (PyObject *)unicode;
2791
Benjamin Peterson29060642009-01-31 22:14:21 +00002792 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 Py_XDECREF(errorHandler);
2794 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 Py_DECREF(unicode);
2796 return NULL;
2797}
2798
Antoine Pitrouab868312009-01-10 15:40:25 +00002799#undef ASCII_CHAR_MASK
2800
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002801#ifdef __APPLE__
2802
2803/* Simplified UTF-8 decoder using surrogateescape error handler,
2804 used to decode the command line arguments on Mac OS X. */
2805
2806wchar_t*
2807_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2808{
2809 int n;
2810 const char *e;
2811 wchar_t *unicode, *p;
2812
2813 /* Note: size will always be longer than the resulting Unicode
2814 character count */
2815 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2816 PyErr_NoMemory();
2817 return NULL;
2818 }
2819 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2820 if (!unicode)
2821 return NULL;
2822
2823 /* Unpack UTF-8 encoded data */
2824 p = unicode;
2825 e = s + size;
2826 while (s < e) {
2827 Py_UCS4 ch = (unsigned char)*s;
2828
2829 if (ch < 0x80) {
2830 *p++ = (wchar_t)ch;
2831 s++;
2832 continue;
2833 }
2834
2835 n = utf8_code_length[ch];
2836 if (s + n > e) {
2837 goto surrogateescape;
2838 }
2839
2840 switch (n) {
2841 case 0:
2842 case 1:
2843 goto surrogateescape;
2844
2845 case 2:
2846 if ((s[1] & 0xc0) != 0x80)
2847 goto surrogateescape;
2848 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2849 assert ((ch > 0x007F) && (ch <= 0x07FF));
2850 *p++ = (wchar_t)ch;
2851 break;
2852
2853 case 3:
2854 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2855 will result in surrogates in range d800-dfff. Surrogates are
2856 not valid UTF-8 so they are rejected.
2857 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2858 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2859 if ((s[1] & 0xc0) != 0x80 ||
2860 (s[2] & 0xc0) != 0x80 ||
2861 ((unsigned char)s[0] == 0xE0 &&
2862 (unsigned char)s[1] < 0xA0) ||
2863 ((unsigned char)s[0] == 0xED &&
2864 (unsigned char)s[1] > 0x9F)) {
2865
2866 goto surrogateescape;
2867 }
2868 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2869 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2870 *p++ = (Py_UNICODE)ch;
2871 break;
2872
2873 case 4:
2874 if ((s[1] & 0xc0) != 0x80 ||
2875 (s[2] & 0xc0) != 0x80 ||
2876 (s[3] & 0xc0) != 0x80 ||
2877 ((unsigned char)s[0] == 0xF0 &&
2878 (unsigned char)s[1] < 0x90) ||
2879 ((unsigned char)s[0] == 0xF4 &&
2880 (unsigned char)s[1] > 0x8F)) {
2881 goto surrogateescape;
2882 }
2883 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2884 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2885 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2886
2887#if SIZEOF_WCHAR_T == 4
2888 *p++ = (wchar_t)ch;
2889#else
2890 /* compute and append the two surrogates: */
2891
2892 /* translate from 10000..10FFFF to 0..FFFF */
2893 ch -= 0x10000;
2894
2895 /* high surrogate = top 10 bits added to D800 */
2896 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2897
2898 /* low surrogate = bottom 10 bits added to DC00 */
2899 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2900#endif
2901 break;
2902 }
2903 s += n;
2904 continue;
2905
2906 surrogateescape:
2907 *p++ = 0xDC00 + ch;
2908 s++;
2909 }
2910 *p = L'\0';
2911 return unicode;
2912}
2913
2914#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002915
Tim Peters602f7402002-04-27 18:03:26 +00002916/* Allocation strategy: if the string is short, convert into a stack buffer
2917 and allocate exactly as much space needed at the end. Else allocate the
2918 maximum possible needed (4 result bytes per Unicode character), and return
2919 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002920*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002921PyObject *
2922PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 Py_ssize_t size,
2924 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925{
Tim Peters602f7402002-04-27 18:03:26 +00002926#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002927
Guido van Rossum98297ee2007-11-06 21:34:58 +00002928 Py_ssize_t i; /* index into s of next input byte */
2929 PyObject *result; /* result string object */
2930 char *p; /* next free byte in output buffer */
2931 Py_ssize_t nallocated; /* number of result bytes allocated */
2932 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002933 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002934 PyObject *errorHandler = NULL;
2935 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002936
Tim Peters602f7402002-04-27 18:03:26 +00002937 assert(s != NULL);
2938 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939
Tim Peters602f7402002-04-27 18:03:26 +00002940 if (size <= MAX_SHORT_UNICHARS) {
2941 /* Write into the stack buffer; nallocated can't overflow.
2942 * At the end, we'll allocate exactly as much heap space as it
2943 * turns out we need.
2944 */
2945 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002946 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002947 p = stackbuf;
2948 }
2949 else {
2950 /* Overallocate on the heap, and give the excess back at the end. */
2951 nallocated = size * 4;
2952 if (nallocated / 4 != size) /* overflow! */
2953 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002954 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002955 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002956 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002957 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002958 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002959
Tim Peters602f7402002-04-27 18:03:26 +00002960 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002961 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002962
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002963 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002964 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002966
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002968 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002969 *p++ = (char)(0xc0 | (ch >> 6));
2970 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002971 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002972#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002973 /* Special case: check for high and low surrogate */
2974 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2975 Py_UCS4 ch2 = s[i];
2976 /* Combine the two surrogates to form a UCS4 value */
2977 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2978 i++;
2979
2980 /* Encode UCS4 Unicode ordinals */
2981 *p++ = (char)(0xf0 | (ch >> 18));
2982 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002983 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2984 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002985 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002986#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002987 Py_ssize_t newpos;
2988 PyObject *rep;
2989 Py_ssize_t repsize, k;
2990 rep = unicode_encode_call_errorhandler
2991 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2992 s, size, &exc, i-1, i, &newpos);
2993 if (!rep)
2994 goto error;
2995
2996 if (PyBytes_Check(rep))
2997 repsize = PyBytes_GET_SIZE(rep);
2998 else
2999 repsize = PyUnicode_GET_SIZE(rep);
3000
3001 if (repsize > 4) {
3002 Py_ssize_t offset;
3003
3004 if (result == NULL)
3005 offset = p - stackbuf;
3006 else
3007 offset = p - PyBytes_AS_STRING(result);
3008
3009 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3010 /* integer overflow */
3011 PyErr_NoMemory();
3012 goto error;
3013 }
3014 nallocated += repsize - 4;
3015 if (result != NULL) {
3016 if (_PyBytes_Resize(&result, nallocated) < 0)
3017 goto error;
3018 } else {
3019 result = PyBytes_FromStringAndSize(NULL, nallocated);
3020 if (result == NULL)
3021 goto error;
3022 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3023 }
3024 p = PyBytes_AS_STRING(result) + offset;
3025 }
3026
3027 if (PyBytes_Check(rep)) {
3028 char *prep = PyBytes_AS_STRING(rep);
3029 for(k = repsize; k > 0; k--)
3030 *p++ = *prep++;
3031 } else /* rep is unicode */ {
3032 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3033 Py_UNICODE c;
3034
3035 for(k=0; k<repsize; k++) {
3036 c = prep[k];
3037 if (0x80 <= c) {
3038 raise_encode_exception(&exc, "utf-8", s, size,
3039 i-1, i, "surrogates not allowed");
3040 goto error;
3041 }
3042 *p++ = (char)prep[k];
3043 }
3044 }
3045 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003046#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003047 }
Victor Stinner445a6232010-04-22 20:01:57 +00003048#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003049 } else if (ch < 0x10000) {
3050 *p++ = (char)(0xe0 | (ch >> 12));
3051 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3052 *p++ = (char)(0x80 | (ch & 0x3f));
3053 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003054 /* Encode UCS4 Unicode ordinals */
3055 *p++ = (char)(0xf0 | (ch >> 18));
3056 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3057 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3058 *p++ = (char)(0x80 | (ch & 0x3f));
3059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003061
Guido van Rossum98297ee2007-11-06 21:34:58 +00003062 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003063 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003064 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003065 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003066 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003067 }
3068 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003069 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003070 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003071 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003072 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003073 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003074 Py_XDECREF(errorHandler);
3075 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003076 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003077 error:
3078 Py_XDECREF(errorHandler);
3079 Py_XDECREF(exc);
3080 Py_XDECREF(result);
3081 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003082
Tim Peters602f7402002-04-27 18:03:26 +00003083#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084}
3085
Alexander Belopolsky40018472011-02-26 01:02:56 +00003086PyObject *
3087PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088{
Victor Stinnera5c68c32011-03-02 01:03:14 +00003089 PyObject *utf8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument();
3092 return NULL;
3093 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003094 utf8 = _PyUnicode_AsDefaultEncodedString(unicode);
3095 if (utf8 == NULL)
3096 return NULL;
3097 Py_INCREF(utf8);
3098 return utf8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099}
3100
Walter Dörwald41980ca2007-08-16 21:55:45 +00003101/* --- UTF-32 Codec ------------------------------------------------------- */
3102
3103PyObject *
3104PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 Py_ssize_t size,
3106 const char *errors,
3107 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003108{
3109 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3110}
3111
3112PyObject *
3113PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003114 Py_ssize_t size,
3115 const char *errors,
3116 int *byteorder,
3117 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003118{
3119 const char *starts = s;
3120 Py_ssize_t startinpos;
3121 Py_ssize_t endinpos;
3122 Py_ssize_t outpos;
3123 PyUnicodeObject *unicode;
3124 Py_UNICODE *p;
3125#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003126 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003127 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003128#else
3129 const int pairs = 0;
3130#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003131 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003132 int bo = 0; /* assume native ordering by default */
3133 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003134 /* Offsets from q for retrieving bytes in the right order. */
3135#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3136 int iorder[] = {0, 1, 2, 3};
3137#else
3138 int iorder[] = {3, 2, 1, 0};
3139#endif
3140 PyObject *errorHandler = NULL;
3141 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003142
Walter Dörwald41980ca2007-08-16 21:55:45 +00003143 q = (unsigned char *)s;
3144 e = q + size;
3145
3146 if (byteorder)
3147 bo = *byteorder;
3148
3149 /* Check for BOM marks (U+FEFF) in the input and adjust current
3150 byte order setting accordingly. In native mode, the leading BOM
3151 mark is skipped, in all other modes, it is copied to the output
3152 stream as-is (giving a ZWNBSP character). */
3153 if (bo == 0) {
3154 if (size >= 4) {
3155 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003156 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003157#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003158 if (bom == 0x0000FEFF) {
3159 q += 4;
3160 bo = -1;
3161 }
3162 else if (bom == 0xFFFE0000) {
3163 q += 4;
3164 bo = 1;
3165 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003166#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 if (bom == 0x0000FEFF) {
3168 q += 4;
3169 bo = 1;
3170 }
3171 else if (bom == 0xFFFE0000) {
3172 q += 4;
3173 bo = -1;
3174 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003175#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003177 }
3178
3179 if (bo == -1) {
3180 /* force LE */
3181 iorder[0] = 0;
3182 iorder[1] = 1;
3183 iorder[2] = 2;
3184 iorder[3] = 3;
3185 }
3186 else if (bo == 1) {
3187 /* force BE */
3188 iorder[0] = 3;
3189 iorder[1] = 2;
3190 iorder[2] = 1;
3191 iorder[3] = 0;
3192 }
3193
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003194 /* On narrow builds we split characters outside the BMP into two
3195 codepoints => count how much extra space we need. */
3196#ifndef Py_UNICODE_WIDE
3197 for (qq = q; qq < e; qq += 4)
3198 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3199 pairs++;
3200#endif
3201
3202 /* This might be one to much, because of a BOM */
3203 unicode = _PyUnicode_New((size+3)/4+pairs);
3204 if (!unicode)
3205 return NULL;
3206 if (size == 0)
3207 return (PyObject *)unicode;
3208
3209 /* Unpack UTF-32 encoded data */
3210 p = unicode->str;
3211
Walter Dörwald41980ca2007-08-16 21:55:45 +00003212 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003213 Py_UCS4 ch;
3214 /* remaining bytes at the end? (size should be divisible by 4) */
3215 if (e-q<4) {
3216 if (consumed)
3217 break;
3218 errmsg = "truncated data";
3219 startinpos = ((const char *)q)-starts;
3220 endinpos = ((const char *)e)-starts;
3221 goto utf32Error;
3222 /* The remaining input chars are ignored if the callback
3223 chooses to skip the input */
3224 }
3225 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3226 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003227
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 if (ch >= 0x110000)
3229 {
3230 errmsg = "codepoint not in range(0x110000)";
3231 startinpos = ((const char *)q)-starts;
3232 endinpos = startinpos+4;
3233 goto utf32Error;
3234 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003235#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 if (ch >= 0x10000)
3237 {
3238 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3239 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3240 }
3241 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003242#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 *p++ = ch;
3244 q += 4;
3245 continue;
3246 utf32Error:
3247 outpos = p-PyUnicode_AS_UNICODE(unicode);
3248 if (unicode_decode_call_errorhandler(
3249 errors, &errorHandler,
3250 "utf32", errmsg,
3251 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3252 &unicode, &outpos, &p))
3253 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003254 }
3255
3256 if (byteorder)
3257 *byteorder = bo;
3258
3259 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003261
3262 /* Adjust length */
3263 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3264 goto onError;
3265
3266 Py_XDECREF(errorHandler);
3267 Py_XDECREF(exc);
3268 return (PyObject *)unicode;
3269
Benjamin Peterson29060642009-01-31 22:14:21 +00003270 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003271 Py_DECREF(unicode);
3272 Py_XDECREF(errorHandler);
3273 Py_XDECREF(exc);
3274 return NULL;
3275}
3276
3277PyObject *
3278PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003279 Py_ssize_t size,
3280 const char *errors,
3281 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003282{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003283 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003284 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003285 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003286#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003287 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003288#else
3289 const int pairs = 0;
3290#endif
3291 /* Offsets from p for storing byte pairs in the right order. */
3292#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3293 int iorder[] = {0, 1, 2, 3};
3294#else
3295 int iorder[] = {3, 2, 1, 0};
3296#endif
3297
Benjamin Peterson29060642009-01-31 22:14:21 +00003298#define STORECHAR(CH) \
3299 do { \
3300 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3301 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3302 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3303 p[iorder[0]] = (CH) & 0xff; \
3304 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003305 } while(0)
3306
3307 /* In narrow builds we can output surrogate pairs as one codepoint,
3308 so we need less space. */
3309#ifndef Py_UNICODE_WIDE
3310 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003311 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3312 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3313 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003314#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003315 nsize = (size - pairs + (byteorder == 0));
3316 bytesize = nsize * 4;
3317 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003318 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003319 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320 if (v == NULL)
3321 return NULL;
3322
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003323 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003324 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003325 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003326 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003327 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003328
3329 if (byteorder == -1) {
3330 /* force LE */
3331 iorder[0] = 0;
3332 iorder[1] = 1;
3333 iorder[2] = 2;
3334 iorder[3] = 3;
3335 }
3336 else if (byteorder == 1) {
3337 /* force BE */
3338 iorder[0] = 3;
3339 iorder[1] = 2;
3340 iorder[2] = 1;
3341 iorder[3] = 0;
3342 }
3343
3344 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003345 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003346#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003347 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3348 Py_UCS4 ch2 = *s;
3349 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3350 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3351 s++;
3352 size--;
3353 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003354 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003355#endif
3356 STORECHAR(ch);
3357 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003358
3359 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003360 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003361#undef STORECHAR
3362}
3363
Alexander Belopolsky40018472011-02-26 01:02:56 +00003364PyObject *
3365PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003366{
3367 if (!PyUnicode_Check(unicode)) {
3368 PyErr_BadArgument();
3369 return NULL;
3370 }
3371 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 PyUnicode_GET_SIZE(unicode),
3373 NULL,
3374 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003375}
3376
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377/* --- UTF-16 Codec ------------------------------------------------------- */
3378
Tim Peters772747b2001-08-09 22:21:55 +00003379PyObject *
3380PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003381 Py_ssize_t size,
3382 const char *errors,
3383 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003384{
Walter Dörwald69652032004-09-07 20:24:22 +00003385 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3386}
3387
Antoine Pitrouab868312009-01-10 15:40:25 +00003388/* Two masks for fast checking of whether a C 'long' may contain
3389 UTF16-encoded surrogate characters. This is an efficient heuristic,
3390 assuming that non-surrogate characters with a code point >= 0x8000 are
3391 rare in most input.
3392 FAST_CHAR_MASK is used when the input is in native byte ordering,
3393 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003394*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003395#if (SIZEOF_LONG == 8)
3396# define FAST_CHAR_MASK 0x8000800080008000L
3397# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3398#elif (SIZEOF_LONG == 4)
3399# define FAST_CHAR_MASK 0x80008000L
3400# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3401#else
3402# error C 'long' size should be either 4 or 8!
3403#endif
3404
Walter Dörwald69652032004-09-07 20:24:22 +00003405PyObject *
3406PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003407 Py_ssize_t size,
3408 const char *errors,
3409 int *byteorder,
3410 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003411{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003413 Py_ssize_t startinpos;
3414 Py_ssize_t endinpos;
3415 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416 PyUnicodeObject *unicode;
3417 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003418 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003419 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003420 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003421 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003422 /* Offsets from q for retrieving byte pairs in the right order. */
3423#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3424 int ihi = 1, ilo = 0;
3425#else
3426 int ihi = 0, ilo = 1;
3427#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 PyObject *errorHandler = NULL;
3429 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430
3431 /* Note: size will always be longer than the resulting Unicode
3432 character count */
3433 unicode = _PyUnicode_New(size);
3434 if (!unicode)
3435 return NULL;
3436 if (size == 0)
3437 return (PyObject *)unicode;
3438
3439 /* Unpack UTF-16 encoded data */
3440 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003441 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003442 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443
3444 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003445 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003446
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003447 /* Check for BOM marks (U+FEFF) in the input and adjust current
3448 byte order setting accordingly. In native mode, the leading BOM
3449 mark is skipped, in all other modes, it is copied to the output
3450 stream as-is (giving a ZWNBSP character). */
3451 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003452 if (size >= 2) {
3453 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003454#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 if (bom == 0xFEFF) {
3456 q += 2;
3457 bo = -1;
3458 }
3459 else if (bom == 0xFFFE) {
3460 q += 2;
3461 bo = 1;
3462 }
Tim Petersced69f82003-09-16 20:30:58 +00003463#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 if (bom == 0xFEFF) {
3465 q += 2;
3466 bo = 1;
3467 }
3468 else if (bom == 0xFFFE) {
3469 q += 2;
3470 bo = -1;
3471 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003472#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003473 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003475
Tim Peters772747b2001-08-09 22:21:55 +00003476 if (bo == -1) {
3477 /* force LE */
3478 ihi = 1;
3479 ilo = 0;
3480 }
3481 else if (bo == 1) {
3482 /* force BE */
3483 ihi = 0;
3484 ilo = 1;
3485 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003486#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3487 native_ordering = ilo < ihi;
3488#else
3489 native_ordering = ilo > ihi;
3490#endif
Tim Peters772747b2001-08-09 22:21:55 +00003491
Antoine Pitrouab868312009-01-10 15:40:25 +00003492 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003493 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003494 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003495 /* First check for possible aligned read of a C 'long'. Unaligned
3496 reads are more expensive, better to defer to another iteration. */
3497 if (!((size_t) q & LONG_PTR_MASK)) {
3498 /* Fast path for runs of non-surrogate chars. */
3499 register const unsigned char *_q = q;
3500 Py_UNICODE *_p = p;
3501 if (native_ordering) {
3502 /* Native ordering is simple: as long as the input cannot
3503 possibly contain a surrogate char, do an unrolled copy
3504 of several 16-bit code points to the target object.
3505 The non-surrogate check is done on several input bytes
3506 at a time (as many as a C 'long' can contain). */
3507 while (_q < aligned_end) {
3508 unsigned long data = * (unsigned long *) _q;
3509 if (data & FAST_CHAR_MASK)
3510 break;
3511 _p[0] = ((unsigned short *) _q)[0];
3512 _p[1] = ((unsigned short *) _q)[1];
3513#if (SIZEOF_LONG == 8)
3514 _p[2] = ((unsigned short *) _q)[2];
3515 _p[3] = ((unsigned short *) _q)[3];
3516#endif
3517 _q += SIZEOF_LONG;
3518 _p += SIZEOF_LONG / 2;
3519 }
3520 }
3521 else {
3522 /* Byteswapped ordering is similar, but we must decompose
3523 the copy bytewise, and take care of zero'ing out the
3524 upper bytes if the target object is in 32-bit units
3525 (that is, in UCS-4 builds). */
3526 while (_q < aligned_end) {
3527 unsigned long data = * (unsigned long *) _q;
3528 if (data & SWAPPED_FAST_CHAR_MASK)
3529 break;
3530 /* Zero upper bytes in UCS-4 builds */
3531#if (Py_UNICODE_SIZE > 2)
3532 _p[0] = 0;
3533 _p[1] = 0;
3534#if (SIZEOF_LONG == 8)
3535 _p[2] = 0;
3536 _p[3] = 0;
3537#endif
3538#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003539 /* Issue #4916; UCS-4 builds on big endian machines must
3540 fill the two last bytes of each 4-byte unit. */
3541#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3542# define OFF 2
3543#else
3544# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003545#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003546 ((unsigned char *) _p)[OFF + 1] = _q[0];
3547 ((unsigned char *) _p)[OFF + 0] = _q[1];
3548 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3549 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3550#if (SIZEOF_LONG == 8)
3551 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3552 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3553 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3554 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3555#endif
3556#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003557 _q += SIZEOF_LONG;
3558 _p += SIZEOF_LONG / 2;
3559 }
3560 }
3561 p = _p;
3562 q = _q;
3563 if (q >= e)
3564 break;
3565 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003566 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003567
Benjamin Peterson14339b62009-01-31 16:36:08 +00003568 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003569
3570 if (ch < 0xD800 || ch > 0xDFFF) {
3571 *p++ = ch;
3572 continue;
3573 }
3574
3575 /* UTF-16 code pair: */
3576 if (q > e) {
3577 errmsg = "unexpected end of data";
3578 startinpos = (((const char *)q) - 2) - starts;
3579 endinpos = ((const char *)e) + 1 - starts;
3580 goto utf16Error;
3581 }
3582 if (0xD800 <= ch && ch <= 0xDBFF) {
3583 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3584 q += 2;
3585 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003586#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 *p++ = ch;
3588 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003589#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003590 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003591#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003592 continue;
3593 }
3594 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003595 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003596 startinpos = (((const char *)q)-4)-starts;
3597 endinpos = startinpos+2;
3598 goto utf16Error;
3599 }
3600
Benjamin Peterson14339b62009-01-31 16:36:08 +00003601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 errmsg = "illegal encoding";
3603 startinpos = (((const char *)q)-2)-starts;
3604 endinpos = startinpos+2;
3605 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003606
Benjamin Peterson29060642009-01-31 22:14:21 +00003607 utf16Error:
3608 outpos = p - PyUnicode_AS_UNICODE(unicode);
3609 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003610 errors,
3611 &errorHandler,
3612 "utf16", errmsg,
3613 &starts,
3614 (const char **)&e,
3615 &startinpos,
3616 &endinpos,
3617 &exc,
3618 (const char **)&q,
3619 &unicode,
3620 &outpos,
3621 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003624 /* remaining byte at the end? (size should be even) */
3625 if (e == q) {
3626 if (!consumed) {
3627 errmsg = "truncated data";
3628 startinpos = ((const char *)q) - starts;
3629 endinpos = ((const char *)e) + 1 - starts;
3630 outpos = p - PyUnicode_AS_UNICODE(unicode);
3631 if (unicode_decode_call_errorhandler(
3632 errors,
3633 &errorHandler,
3634 "utf16", errmsg,
3635 &starts,
3636 (const char **)&e,
3637 &startinpos,
3638 &endinpos,
3639 &exc,
3640 (const char **)&q,
3641 &unicode,
3642 &outpos,
3643 &p))
3644 goto onError;
3645 /* The remaining input chars are ignored if the callback
3646 chooses to skip the input */
3647 }
3648 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649
3650 if (byteorder)
3651 *byteorder = bo;
3652
Walter Dörwald69652032004-09-07 20:24:22 +00003653 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003654 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003655
Guido van Rossumd57fd912000-03-10 22:53:23 +00003656 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003657 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003658 goto onError;
3659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660 Py_XDECREF(errorHandler);
3661 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662 return (PyObject *)unicode;
3663
Benjamin Peterson29060642009-01-31 22:14:21 +00003664 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003665 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003666 Py_XDECREF(errorHandler);
3667 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 return NULL;
3669}
3670
Antoine Pitrouab868312009-01-10 15:40:25 +00003671#undef FAST_CHAR_MASK
3672#undef SWAPPED_FAST_CHAR_MASK
3673
Tim Peters772747b2001-08-09 22:21:55 +00003674PyObject *
3675PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 Py_ssize_t size,
3677 const char *errors,
3678 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003680 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003681 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003682 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003683#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003684 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003685#else
3686 const int pairs = 0;
3687#endif
Tim Peters772747b2001-08-09 22:21:55 +00003688 /* Offsets from p for storing byte pairs in the right order. */
3689#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3690 int ihi = 1, ilo = 0;
3691#else
3692 int ihi = 0, ilo = 1;
3693#endif
3694
Benjamin Peterson29060642009-01-31 22:14:21 +00003695#define STORECHAR(CH) \
3696 do { \
3697 p[ihi] = ((CH) >> 8) & 0xff; \
3698 p[ilo] = (CH) & 0xff; \
3699 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003700 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003702#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003703 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 if (s[i] >= 0x10000)
3705 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003706#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003707 /* 2 * (size + pairs + (byteorder == 0)) */
3708 if (size > PY_SSIZE_T_MAX ||
3709 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003711 nsize = size + pairs + (byteorder == 0);
3712 bytesize = nsize * 2;
3713 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003715 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 if (v == NULL)
3717 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003719 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003720 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003722 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003723 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003724
3725 if (byteorder == -1) {
3726 /* force LE */
3727 ihi = 1;
3728 ilo = 0;
3729 }
3730 else if (byteorder == 1) {
3731 /* force BE */
3732 ihi = 0;
3733 ilo = 1;
3734 }
3735
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003736 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003737 Py_UNICODE ch = *s++;
3738 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003739#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003740 if (ch >= 0x10000) {
3741 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3742 ch = 0xD800 | ((ch-0x10000) >> 10);
3743 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003744#endif
Tim Peters772747b2001-08-09 22:21:55 +00003745 STORECHAR(ch);
3746 if (ch2)
3747 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003748 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003749
3750 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003751 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003752#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003753}
3754
Alexander Belopolsky40018472011-02-26 01:02:56 +00003755PyObject *
3756PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757{
3758 if (!PyUnicode_Check(unicode)) {
3759 PyErr_BadArgument();
3760 return NULL;
3761 }
3762 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003763 PyUnicode_GET_SIZE(unicode),
3764 NULL,
3765 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766}
3767
3768/* --- Unicode Escape Codec ----------------------------------------------- */
3769
Fredrik Lundh06d12682001-01-24 07:59:11 +00003770static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003771
Alexander Belopolsky40018472011-02-26 01:02:56 +00003772PyObject *
3773PyUnicode_DecodeUnicodeEscape(const char *s,
3774 Py_ssize_t size,
3775 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003776{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003777 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003778 Py_ssize_t startinpos;
3779 Py_ssize_t endinpos;
3780 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003781 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003783 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003785 char* message;
3786 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 PyObject *errorHandler = NULL;
3788 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003789
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 /* Escaped strings will always be longer than the resulting
3791 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 length after conversion to the true value.
3793 (but if the error callback returns a long replacement string
3794 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 v = _PyUnicode_New(size);
3796 if (v == NULL)
3797 goto onError;
3798 if (size == 0)
3799 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003800
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003801 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003803
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 while (s < end) {
3805 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003806 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808
3809 /* Non-escape characters are interpreted as Unicode ordinals */
3810 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003811 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812 continue;
3813 }
3814
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003815 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003816 /* \ - Escapes */
3817 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003818 c = *s++;
3819 if (s > end)
3820 c = '\0'; /* Invalid after \ */
3821 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 case '\n': break;
3825 case '\\': *p++ = '\\'; break;
3826 case '\'': *p++ = '\''; break;
3827 case '\"': *p++ = '\"'; break;
3828 case 'b': *p++ = '\b'; break;
3829 case 'f': *p++ = '\014'; break; /* FF */
3830 case 't': *p++ = '\t'; break;
3831 case 'n': *p++ = '\n'; break;
3832 case 'r': *p++ = '\r'; break;
3833 case 'v': *p++ = '\013'; break; /* VT */
3834 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3835
Benjamin Peterson29060642009-01-31 22:14:21 +00003836 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 case '0': case '1': case '2': case '3':
3838 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003839 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003840 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003841 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003842 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003843 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003845 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 break;
3847
Benjamin Peterson29060642009-01-31 22:14:21 +00003848 /* hex escapes */
3849 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003850 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003851 digits = 2;
3852 message = "truncated \\xXX escape";
3853 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854
Benjamin Peterson29060642009-01-31 22:14:21 +00003855 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003856 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003857 digits = 4;
3858 message = "truncated \\uXXXX escape";
3859 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860
Benjamin Peterson29060642009-01-31 22:14:21 +00003861 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003862 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003863 digits = 8;
3864 message = "truncated \\UXXXXXXXX escape";
3865 hexescape:
3866 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003867 outpos = p-PyUnicode_AS_UNICODE(v);
3868 if (s+digits>end) {
3869 endinpos = size;
3870 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003871 errors, &errorHandler,
3872 "unicodeescape", "end of string in escape sequence",
3873 &starts, &end, &startinpos, &endinpos, &exc, &s,
3874 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003875 goto onError;
3876 goto nextByte;
3877 }
3878 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003879 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003880 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003881 endinpos = (s+i+1)-starts;
3882 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003883 errors, &errorHandler,
3884 "unicodeescape", message,
3885 &starts, &end, &startinpos, &endinpos, &exc, &s,
3886 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003887 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003888 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003889 }
3890 chr = (chr<<4) & ~0xF;
3891 if (c >= '0' && c <= '9')
3892 chr += c - '0';
3893 else if (c >= 'a' && c <= 'f')
3894 chr += 10 + c - 'a';
3895 else
3896 chr += 10 + c - 'A';
3897 }
3898 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003899 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003900 /* _decoding_error will have already written into the
3901 target buffer. */
3902 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003903 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003904 /* when we get here, chr is a 32-bit unicode character */
3905 if (chr <= 0xffff)
3906 /* UCS-2 character */
3907 *p++ = (Py_UNICODE) chr;
3908 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003909 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003910 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003911#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003912 *p++ = chr;
3913#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003914 chr -= 0x10000L;
3915 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003916 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003917#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003918 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003919 endinpos = s-starts;
3920 outpos = p-PyUnicode_AS_UNICODE(v);
3921 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003922 errors, &errorHandler,
3923 "unicodeescape", "illegal Unicode character",
3924 &starts, &end, &startinpos, &endinpos, &exc, &s,
3925 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003926 goto onError;
3927 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003928 break;
3929
Benjamin Peterson29060642009-01-31 22:14:21 +00003930 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003931 case 'N':
3932 message = "malformed \\N character escape";
3933 if (ucnhash_CAPI == NULL) {
3934 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003935 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003936 if (ucnhash_CAPI == NULL)
3937 goto ucnhashError;
3938 }
3939 if (*s == '{') {
3940 const char *start = s+1;
3941 /* look for the closing brace */
3942 while (*s != '}' && s < end)
3943 s++;
3944 if (s > start && s < end && *s == '}') {
3945 /* found a name. look it up in the unicode database */
3946 message = "unknown Unicode character name";
3947 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003948 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003949 goto store;
3950 }
3951 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003952 endinpos = s-starts;
3953 outpos = p-PyUnicode_AS_UNICODE(v);
3954 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003955 errors, &errorHandler,
3956 "unicodeescape", message,
3957 &starts, &end, &startinpos, &endinpos, &exc, &s,
3958 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003959 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003960 break;
3961
3962 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003963 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 message = "\\ at end of string";
3965 s--;
3966 endinpos = s-starts;
3967 outpos = p-PyUnicode_AS_UNICODE(v);
3968 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003969 errors, &errorHandler,
3970 "unicodeescape", message,
3971 &starts, &end, &startinpos, &endinpos, &exc, &s,
3972 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003973 goto onError;
3974 }
3975 else {
3976 *p++ = '\\';
3977 *p++ = (unsigned char)s[-1];
3978 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003979 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003981 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003982 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003984 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003985 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003986 Py_XDECREF(errorHandler);
3987 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003989
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003991 PyErr_SetString(
3992 PyExc_UnicodeError,
3993 "\\N escapes not supported (can't load unicodedata module)"
3994 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003995 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003996 Py_XDECREF(errorHandler);
3997 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003998 return NULL;
3999
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004002 Py_XDECREF(errorHandler);
4003 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 return NULL;
4005}
4006
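/* Return a Unicode-Escape string version of the Unicode object.

   This comment belongs to PyUnicode_EncodeUnicodeEscape() below, not to the
   findchar() helper that directly follows it.  PyUnicode_EncodeUnicodeEscape()
   takes no "quotes" argument: the result is never enclosed in u"" or u''
   quotes.

*/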
4013
Thomas Wouters477c8d52006-05-27 19:21:47 +00004014Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 Py_ssize_t size,
4016 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00004017{
4018 /* like wcschr, but doesn't stop at NULL characters */
4019
4020 while (size-- > 0) {
4021 if (*s == ch)
4022 return s;
4023 s++;
4024 }
4025
4026 return NULL;
4027}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004028
Walter Dörwald79e913e2007-05-12 11:08:06 +00004029static const char *hexdigits = "0123456789abcdef";
4030
Alexander Belopolsky40018472011-02-26 01:02:56 +00004031PyObject *
4032PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4033 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004035 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004038#ifdef Py_UNICODE_WIDE
4039 const Py_ssize_t expandsize = 10;
4040#else
4041 const Py_ssize_t expandsize = 6;
4042#endif
4043
Thomas Wouters89f507f2006-12-13 04:49:30 +00004044 /* XXX(nnorwitz): rather than over-allocating, it would be
4045 better to choose a different scheme. Perhaps scan the
4046 first N-chars of the string and allocate based on that size.
4047 */
4048 /* Initial allocation is based on the longest-possible unichr
4049 escape.
4050
4051 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4052 unichr, so in this case it's the longest unichr escape. In
4053 narrow (UTF-16) builds this is five chars per source unichr
4054 since there are two unichrs in the surrogate pair, so in narrow
4055 (UTF-16) builds it's not the longest unichr escape.
4056
4057 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4058 so in the narrow (UTF-16) build case it's the longest unichr
4059 escape.
4060 */
4061
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004062 if (size == 0)
4063 return PyBytes_FromStringAndSize(NULL, 0);
4064
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004065 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004066 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004067
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004068 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 2
4070 + expandsize*size
4071 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 if (repr == NULL)
4073 return NULL;
4074
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004075 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004076
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 while (size-- > 0) {
4078 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004079
Walter Dörwald79e913e2007-05-12 11:08:06 +00004080 /* Escape backslashes */
4081 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 *p++ = '\\';
4083 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004084 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004085 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004086
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004087#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004088 /* Map 21-bit characters to '\U00xxxxxx' */
4089 else if (ch >= 0x10000) {
4090 *p++ = '\\';
4091 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004092 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4093 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4094 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4095 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4096 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4097 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4098 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4099 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004100 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004101 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004102#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4104 else if (ch >= 0xD800 && ch < 0xDC00) {
4105 Py_UNICODE ch2;
4106 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004107
Benjamin Peterson29060642009-01-31 22:14:21 +00004108 ch2 = *s++;
4109 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004110 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4112 *p++ = '\\';
4113 *p++ = 'U';
4114 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4115 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4116 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4117 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4118 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4119 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4120 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4121 *p++ = hexdigits[ucs & 0x0000000F];
4122 continue;
4123 }
4124 /* Fall through: isolated surrogates are copied as-is */
4125 s--;
4126 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004127 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004128#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004129
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004131 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004132 *p++ = '\\';
4133 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004134 *p++ = hexdigits[(ch >> 12) & 0x000F];
4135 *p++ = hexdigits[(ch >> 8) & 0x000F];
4136 *p++ = hexdigits[(ch >> 4) & 0x000F];
4137 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004139
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004140 /* Map special whitespace to '\t', \n', '\r' */
4141 else if (ch == '\t') {
4142 *p++ = '\\';
4143 *p++ = 't';
4144 }
4145 else if (ch == '\n') {
4146 *p++ = '\\';
4147 *p++ = 'n';
4148 }
4149 else if (ch == '\r') {
4150 *p++ = '\\';
4151 *p++ = 'r';
4152 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004153
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004154 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004155 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004157 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004158 *p++ = hexdigits[(ch >> 4) & 0x000F];
4159 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004160 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004161
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 /* Copy everything else as-is */
4163 else
4164 *p++ = (char) ch;
4165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004167 assert(p - PyBytes_AS_STRING(repr) > 0);
4168 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4169 return NULL;
4170 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171}
4172
Alexander Belopolsky40018472011-02-26 01:02:56 +00004173PyObject *
4174PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004175{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004176 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 if (!PyUnicode_Check(unicode)) {
4178 PyErr_BadArgument();
4179 return NULL;
4180 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004181 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4182 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004183 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184}
4185
4186/* --- Raw Unicode Escape Codec ------------------------------------------- */
4187
Alexander Belopolsky40018472011-02-26 01:02:56 +00004188PyObject *
4189PyUnicode_DecodeRawUnicodeEscape(const char *s,
4190 Py_ssize_t size,
4191 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004194 Py_ssize_t startinpos;
4195 Py_ssize_t endinpos;
4196 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 const char *end;
4200 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 PyObject *errorHandler = NULL;
4202 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004203
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 /* Escaped strings will always be longer than the resulting
4205 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 length after conversion to the true value. (But decoding error
4207 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 v = _PyUnicode_New(size);
4209 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004210 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 end = s + size;
4215 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 unsigned char c;
4217 Py_UCS4 x;
4218 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004219 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 /* Non-escape characters are interpreted as Unicode ordinals */
4222 if (*s != '\\') {
4223 *p++ = (unsigned char)*s++;
4224 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004225 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 startinpos = s-starts;
4227
4228 /* \u-escapes are only interpreted iff the number of leading
4229 backslashes if odd */
4230 bs = s;
4231 for (;s < end;) {
4232 if (*s != '\\')
4233 break;
4234 *p++ = (unsigned char)*s++;
4235 }
4236 if (((s - bs) & 1) == 0 ||
4237 s >= end ||
4238 (*s != 'u' && *s != 'U')) {
4239 continue;
4240 }
4241 p--;
4242 count = *s=='u' ? 4 : 8;
4243 s++;
4244
4245 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4246 outpos = p-PyUnicode_AS_UNICODE(v);
4247 for (x = 0, i = 0; i < count; ++i, ++s) {
4248 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004249 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 endinpos = s-starts;
4251 if (unicode_decode_call_errorhandler(
4252 errors, &errorHandler,
4253 "rawunicodeescape", "truncated \\uXXXX",
4254 &starts, &end, &startinpos, &endinpos, &exc, &s,
4255 &v, &outpos, &p))
4256 goto onError;
4257 goto nextByte;
4258 }
4259 x = (x<<4) & ~0xF;
4260 if (c >= '0' && c <= '9')
4261 x += c - '0';
4262 else if (c >= 'a' && c <= 'f')
4263 x += 10 + c - 'a';
4264 else
4265 x += 10 + c - 'A';
4266 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004267 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 /* UCS-2 character */
4269 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004270 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 /* UCS-4 character. Either store directly, or as
4272 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004273#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004275#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004276 x -= 0x10000L;
4277 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4278 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004279#endif
4280 } else {
4281 endinpos = s-starts;
4282 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004283 if (unicode_decode_call_errorhandler(
4284 errors, &errorHandler,
4285 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 &starts, &end, &startinpos, &endinpos, &exc, &s,
4287 &v, &outpos, &p))
4288 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 nextByte:
4291 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004293 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004295 Py_XDECREF(errorHandler);
4296 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004298
Benjamin Peterson29060642009-01-31 22:14:21 +00004299 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004301 Py_XDECREF(errorHandler);
4302 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004303 return NULL;
4304}
4305
Alexander Belopolsky40018472011-02-26 01:02:56 +00004306PyObject *
4307PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4308 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004310 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 char *p;
4312 char *q;
4313
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004314#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004315 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004316#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004317 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004318#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004319
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004320 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004321 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004322
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004323 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 if (repr == NULL)
4325 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004326 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004327 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004329 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 while (size-- > 0) {
4331 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004332#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004333 /* Map 32-bit characters to '\Uxxxxxxxx' */
4334 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004335 *p++ = '\\';
4336 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004337 *p++ = hexdigits[(ch >> 28) & 0xf];
4338 *p++ = hexdigits[(ch >> 24) & 0xf];
4339 *p++ = hexdigits[(ch >> 20) & 0xf];
4340 *p++ = hexdigits[(ch >> 16) & 0xf];
4341 *p++ = hexdigits[(ch >> 12) & 0xf];
4342 *p++ = hexdigits[(ch >> 8) & 0xf];
4343 *p++ = hexdigits[(ch >> 4) & 0xf];
4344 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004345 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004346 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004347#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4349 if (ch >= 0xD800 && ch < 0xDC00) {
4350 Py_UNICODE ch2;
4351 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004352
Benjamin Peterson29060642009-01-31 22:14:21 +00004353 ch2 = *s++;
4354 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004355 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004356 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4357 *p++ = '\\';
4358 *p++ = 'U';
4359 *p++ = hexdigits[(ucs >> 28) & 0xf];
4360 *p++ = hexdigits[(ucs >> 24) & 0xf];
4361 *p++ = hexdigits[(ucs >> 20) & 0xf];
4362 *p++ = hexdigits[(ucs >> 16) & 0xf];
4363 *p++ = hexdigits[(ucs >> 12) & 0xf];
4364 *p++ = hexdigits[(ucs >> 8) & 0xf];
4365 *p++ = hexdigits[(ucs >> 4) & 0xf];
4366 *p++ = hexdigits[ucs & 0xf];
4367 continue;
4368 }
4369 /* Fall through: isolated surrogates are copied as-is */
4370 s--;
4371 size++;
4372 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004373#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004374 /* Map 16-bit characters to '\uxxxx' */
4375 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376 *p++ = '\\';
4377 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004378 *p++ = hexdigits[(ch >> 12) & 0xf];
4379 *p++ = hexdigits[(ch >> 8) & 0xf];
4380 *p++ = hexdigits[(ch >> 4) & 0xf];
4381 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004382 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004383 /* Copy everything else as-is */
4384 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385 *p++ = (char) ch;
4386 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004387 size = p - q;
4388
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004389 assert(size > 0);
4390 if (_PyBytes_Resize(&repr, size) < 0)
4391 return NULL;
4392 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393}
4394
Alexander Belopolsky40018472011-02-26 01:02:56 +00004395PyObject *
4396PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004398 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004400 PyErr_BadArgument();
4401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004403 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4404 PyUnicode_GET_SIZE(unicode));
4405
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004406 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407}
4408
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004409/* --- Unicode Internal Codec ------------------------------------------- */
4410
Alexander Belopolsky40018472011-02-26 01:02:56 +00004411PyObject *
4412_PyUnicode_DecodeUnicodeInternal(const char *s,
4413 Py_ssize_t size,
4414 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004415{
4416 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004417 Py_ssize_t startinpos;
4418 Py_ssize_t endinpos;
4419 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004420 PyUnicodeObject *v;
4421 Py_UNICODE *p;
4422 const char *end;
4423 const char *reason;
4424 PyObject *errorHandler = NULL;
4425 PyObject *exc = NULL;
4426
Neal Norwitzd43069c2006-01-08 01:12:10 +00004427#ifdef Py_UNICODE_WIDE
4428 Py_UNICODE unimax = PyUnicode_GetMax();
4429#endif
4430
Thomas Wouters89f507f2006-12-13 04:49:30 +00004431 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004432 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4433 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004434 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004435 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004437 p = PyUnicode_AS_UNICODE(v);
4438 end = s + size;
4439
4440 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004441 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004442 /* We have to sanity check the raw data, otherwise doom looms for
4443 some malformed UCS-4 data. */
4444 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004445#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004446 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004447#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004448 end-s < Py_UNICODE_SIZE
4449 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004451 startinpos = s - starts;
4452 if (end-s < Py_UNICODE_SIZE) {
4453 endinpos = end-starts;
4454 reason = "truncated input";
4455 }
4456 else {
4457 endinpos = s - starts + Py_UNICODE_SIZE;
4458 reason = "illegal code point (> 0x10FFFF)";
4459 }
4460 outpos = p - PyUnicode_AS_UNICODE(v);
4461 if (unicode_decode_call_errorhandler(
4462 errors, &errorHandler,
4463 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004464 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004465 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004466 goto onError;
4467 }
4468 }
4469 else {
4470 p++;
4471 s += Py_UNICODE_SIZE;
4472 }
4473 }
4474
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004475 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004476 goto onError;
4477 Py_XDECREF(errorHandler);
4478 Py_XDECREF(exc);
4479 return (PyObject *)v;
4480
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004482 Py_XDECREF(v);
4483 Py_XDECREF(errorHandler);
4484 Py_XDECREF(exc);
4485 return NULL;
4486}
4487
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488/* --- Latin-1 Codec ------------------------------------------------------ */
4489
Alexander Belopolsky40018472011-02-26 01:02:56 +00004490PyObject *
4491PyUnicode_DecodeLatin1(const char *s,
4492 Py_ssize_t size,
4493 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494{
4495 PyUnicodeObject *v;
4496 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004497 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004498
Guido van Rossumd57fd912000-03-10 22:53:23 +00004499 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004500 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 Py_UNICODE r = *(unsigned char*)s;
4502 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004503 }
4504
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505 v = _PyUnicode_New(size);
4506 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004507 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004510 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004511 e = s + size;
4512 /* Unrolling the copy makes it much faster by reducing the looping
4513 overhead. This is similar to what many memcpy() implementations do. */
4514 unrolled_end = e - 4;
4515 while (s < unrolled_end) {
4516 p[0] = (unsigned char) s[0];
4517 p[1] = (unsigned char) s[1];
4518 p[2] = (unsigned char) s[2];
4519 p[3] = (unsigned char) s[3];
4520 s += 4;
4521 p += 4;
4522 }
4523 while (s < e)
4524 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004526
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 Py_XDECREF(v);
4529 return NULL;
4530}
4531
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004532/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004533static void
4534make_encode_exception(PyObject **exceptionObject,
4535 const char *encoding,
4536 const Py_UNICODE *unicode, Py_ssize_t size,
4537 Py_ssize_t startpos, Py_ssize_t endpos,
4538 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004540 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 *exceptionObject = PyUnicodeEncodeError_Create(
4542 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 }
4544 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004545 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4546 goto onError;
4547 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4548 goto onError;
4549 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4550 goto onError;
4551 return;
4552 onError:
4553 Py_DECREF(*exceptionObject);
4554 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 }
4556}
4557
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004559static void
4560raise_encode_exception(PyObject **exceptionObject,
4561 const char *encoding,
4562 const Py_UNICODE *unicode, Py_ssize_t size,
4563 Py_ssize_t startpos, Py_ssize_t endpos,
4564 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565{
4566 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004568 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004570}
4571
4572/* error handling callback helper:
4573 build arguments, call the callback and check the arguments,
4574 put the result into newpos and return the replacement string, which
4575 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004576static PyObject *
4577unicode_encode_call_errorhandler(const char *errors,
4578 PyObject **errorHandler,
4579 const char *encoding, const char *reason,
4580 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4581 Py_ssize_t startpos, Py_ssize_t endpos,
4582 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004584 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585
4586 PyObject *restuple;
4587 PyObject *resunicode;
4588
4589 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004590 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004592 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 }
4594
4595 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599
4600 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004602 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004603 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004604 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004605 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 Py_DECREF(restuple);
4607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004609 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 &resunicode, newpos)) {
4611 Py_DECREF(restuple);
4612 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004614 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4615 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4616 Py_DECREF(restuple);
4617 return NULL;
4618 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004620 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004621 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004622 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4623 Py_DECREF(restuple);
4624 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004625 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 Py_INCREF(resunicode);
4627 Py_DECREF(restuple);
4628 return resunicode;
4629}
4630
Alexander Belopolsky40018472011-02-26 01:02:56 +00004631static PyObject *
4632unicode_encode_ucs1(const Py_UNICODE *p,
4633 Py_ssize_t size,
4634 const char *errors,
4635 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004636{
4637 /* output object */
4638 PyObject *res;
4639 /* pointers to the beginning and end+1 of input */
4640 const Py_UNICODE *startp = p;
4641 const Py_UNICODE *endp = p + size;
4642 /* pointer to the beginning of the unencodable characters */
4643 /* const Py_UNICODE *badp = NULL; */
4644 /* pointer into the output */
4645 char *str;
4646 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004647 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004648 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4649 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 PyObject *errorHandler = NULL;
4651 PyObject *exc = NULL;
4652 /* the following variable is used for caching string comparisons
4653 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4654 int known_errorHandler = -1;
4655
4656 /* allocate enough for a simple encoding without
4657 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004658 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004659 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004660 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004661 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004662 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004663 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004664 ressize = size;
4665
4666 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004667 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004668
Benjamin Peterson29060642009-01-31 22:14:21 +00004669 /* can we encode this? */
4670 if (c<limit) {
4671 /* no overflow check, because we know that the space is enough */
4672 *str++ = (char)c;
4673 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004674 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 else {
4676 Py_ssize_t unicodepos = p-startp;
4677 Py_ssize_t requiredsize;
4678 PyObject *repunicode;
4679 Py_ssize_t repsize;
4680 Py_ssize_t newpos;
4681 Py_ssize_t respos;
4682 Py_UNICODE *uni2;
4683 /* startpos for collecting unencodable chars */
4684 const Py_UNICODE *collstart = p;
4685 const Py_UNICODE *collend = p;
4686 /* find all unecodable characters */
4687 while ((collend < endp) && ((*collend)>=limit))
4688 ++collend;
4689 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4690 if (known_errorHandler==-1) {
4691 if ((errors==NULL) || (!strcmp(errors, "strict")))
4692 known_errorHandler = 1;
4693 else if (!strcmp(errors, "replace"))
4694 known_errorHandler = 2;
4695 else if (!strcmp(errors, "ignore"))
4696 known_errorHandler = 3;
4697 else if (!strcmp(errors, "xmlcharrefreplace"))
4698 known_errorHandler = 4;
4699 else
4700 known_errorHandler = 0;
4701 }
4702 switch (known_errorHandler) {
4703 case 1: /* strict */
4704 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4705 goto onError;
4706 case 2: /* replace */
4707 while (collstart++<collend)
4708 *str++ = '?'; /* fall through */
4709 case 3: /* ignore */
4710 p = collend;
4711 break;
4712 case 4: /* xmlcharrefreplace */
4713 respos = str - PyBytes_AS_STRING(res);
4714 /* determine replacement size (temporarily (mis)uses p) */
4715 for (p = collstart, repsize = 0; p < collend; ++p) {
4716 if (*p<10)
4717 repsize += 2+1+1;
4718 else if (*p<100)
4719 repsize += 2+2+1;
4720 else if (*p<1000)
4721 repsize += 2+3+1;
4722 else if (*p<10000)
4723 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004724#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004725 else
4726 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004727#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004728 else if (*p<100000)
4729 repsize += 2+5+1;
4730 else if (*p<1000000)
4731 repsize += 2+6+1;
4732 else
4733 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004734#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004735 }
4736 requiredsize = respos+repsize+(endp-collend);
4737 if (requiredsize > ressize) {
4738 if (requiredsize<2*ressize)
4739 requiredsize = 2*ressize;
4740 if (_PyBytes_Resize(&res, requiredsize))
4741 goto onError;
4742 str = PyBytes_AS_STRING(res) + respos;
4743 ressize = requiredsize;
4744 }
4745 /* generate replacement (temporarily (mis)uses p) */
4746 for (p = collstart; p < collend; ++p) {
4747 str += sprintf(str, "&#%d;", (int)*p);
4748 }
4749 p = collend;
4750 break;
4751 default:
4752 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4753 encoding, reason, startp, size, &exc,
4754 collstart-startp, collend-startp, &newpos);
4755 if (repunicode == NULL)
4756 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004757 if (PyBytes_Check(repunicode)) {
4758 /* Directly copy bytes result to output. */
4759 repsize = PyBytes_Size(repunicode);
4760 if (repsize > 1) {
4761 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004762 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004763 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4764 Py_DECREF(repunicode);
4765 goto onError;
4766 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004767 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004768 ressize += repsize-1;
4769 }
4770 memcpy(str, PyBytes_AsString(repunicode), repsize);
4771 str += repsize;
4772 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004773 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004774 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004775 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004776 /* need more space? (at least enough for what we
4777 have+the replacement+the rest of the string, so
4778 we won't have to check space for encodable characters) */
4779 respos = str - PyBytes_AS_STRING(res);
4780 repsize = PyUnicode_GET_SIZE(repunicode);
4781 requiredsize = respos+repsize+(endp-collend);
4782 if (requiredsize > ressize) {
4783 if (requiredsize<2*ressize)
4784 requiredsize = 2*ressize;
4785 if (_PyBytes_Resize(&res, requiredsize)) {
4786 Py_DECREF(repunicode);
4787 goto onError;
4788 }
4789 str = PyBytes_AS_STRING(res) + respos;
4790 ressize = requiredsize;
4791 }
4792 /* check if there is anything unencodable in the replacement
4793 and copy it to the output */
4794 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4795 c = *uni2;
4796 if (c >= limit) {
4797 raise_encode_exception(&exc, encoding, startp, size,
4798 unicodepos, unicodepos+1, reason);
4799 Py_DECREF(repunicode);
4800 goto onError;
4801 }
4802 *str = (char)c;
4803 }
4804 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004805 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004806 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004807 }
4808 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004809 /* Resize if we allocated to much */
4810 size = str - PyBytes_AS_STRING(res);
4811 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004812 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004813 if (_PyBytes_Resize(&res, size) < 0)
4814 goto onError;
4815 }
4816
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004817 Py_XDECREF(errorHandler);
4818 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004819 return res;
4820
4821 onError:
4822 Py_XDECREF(res);
4823 Py_XDECREF(errorHandler);
4824 Py_XDECREF(exc);
4825 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004826}
4827
Alexander Belopolsky40018472011-02-26 01:02:56 +00004828PyObject *
4829PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4830 Py_ssize_t size,
4831 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004833 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834}
4835
Alexander Belopolsky40018472011-02-26 01:02:56 +00004836PyObject *
4837PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838{
4839 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004840 PyErr_BadArgument();
4841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 }
4843 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 PyUnicode_GET_SIZE(unicode),
4845 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846}
4847
4848/* --- 7-bit ASCII Codec -------------------------------------------------- */
4849
Alexander Belopolsky40018472011-02-26 01:02:56 +00004850PyObject *
4851PyUnicode_DecodeASCII(const char *s,
4852 Py_ssize_t size,
4853 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004855 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 PyUnicodeObject *v;
4857 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004858 Py_ssize_t startinpos;
4859 Py_ssize_t endinpos;
4860 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004861 const char *e;
4862 PyObject *errorHandler = NULL;
4863 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004864
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004866 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 Py_UNICODE r = *(unsigned char*)s;
4868 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004869 }
Tim Petersced69f82003-09-16 20:30:58 +00004870
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 v = _PyUnicode_New(size);
4872 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004873 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004877 e = s + size;
4878 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004879 register unsigned char c = (unsigned char)*s;
4880 if (c < 128) {
4881 *p++ = c;
4882 ++s;
4883 }
4884 else {
4885 startinpos = s-starts;
4886 endinpos = startinpos + 1;
4887 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4888 if (unicode_decode_call_errorhandler(
4889 errors, &errorHandler,
4890 "ascii", "ordinal not in range(128)",
4891 &starts, &e, &startinpos, &endinpos, &exc, &s,
4892 &v, &outpos, &p))
4893 goto onError;
4894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004896 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004897 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4898 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004899 Py_XDECREF(errorHandler);
4900 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004902
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004905 Py_XDECREF(errorHandler);
4906 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 return NULL;
4908}
4909
Alexander Belopolsky40018472011-02-26 01:02:56 +00004910PyObject *
4911PyUnicode_EncodeASCII(const Py_UNICODE *p,
4912 Py_ssize_t size,
4913 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004915 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916}
4917
Alexander Belopolsky40018472011-02-26 01:02:56 +00004918PyObject *
4919PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920{
4921 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 PyErr_BadArgument();
4923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 }
4925 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 PyUnicode_GET_SIZE(unicode),
4927 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928}
4929
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004930#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004931
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004932/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004933
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004934#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004935#define NEED_RETRY
4936#endif
4937
4938/* XXX This code is limited to "true" double-byte encodings, as
4939 a) it assumes an incomplete character consists of a single byte, and
4940 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004942
/* Return nonzero if the byte at s[offset] is to be treated as a DBCS
   lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() rejects is never a lead byte.  Otherwise
   the character before it (as located by CharPrev()) is examined: the
   byte counts as a lead byte when there is no previous character, when
   the previous character does not start with a lead byte, or when the
   previous character is two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
4954
4955/*
4956 * Decode MBCS string into unicode object. If 'final' is set, converts
4957 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4958 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004959static int
4960decode_mbcs(PyUnicodeObject **v,
4961 const char *s, /* MBCS string */
4962 int size, /* sizeof MBCS string */
4963 int final,
4964 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004965{
4966 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004967 Py_ssize_t n;
4968 DWORD usize;
4969 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004970
4971 assert(size >= 0);
4972
Victor Stinner554f3f02010-06-16 23:33:54 +00004973 /* check and handle 'errors' arg */
4974 if (errors==NULL || strcmp(errors, "strict")==0)
4975 flags = MB_ERR_INVALID_CHARS;
4976 else if (strcmp(errors, "ignore")==0)
4977 flags = 0;
4978 else {
4979 PyErr_Format(PyExc_ValueError,
4980 "mbcs encoding does not support errors='%s'",
4981 errors);
4982 return -1;
4983 }
4984
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004985 /* Skip trailing lead-byte unless 'final' is set */
4986 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004987 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004988
4989 /* First get the size of the result */
4990 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004991 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4992 if (usize==0)
4993 goto mbcs_decode_error;
4994 } else
4995 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004996
4997 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 /* Create unicode object */
4999 *v = _PyUnicode_New(usize);
5000 if (*v == NULL)
5001 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005002 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005003 }
5004 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 /* Extend unicode object */
5006 n = PyUnicode_GET_SIZE(*v);
5007 if (_PyUnicode_Resize(v, n + usize) < 0)
5008 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005009 }
5010
5011 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00005012 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005014 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5015 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005017 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00005019
5020mbcs_decode_error:
5021 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5022 we raise a UnicodeDecodeError - else it is a 'generic'
5023 windows error
5024 */
5025 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5026 /* Ideally, we should get reason from FormatMessage - this
5027 is the Windows 2000 English version of the message
5028 */
5029 PyObject *exc = NULL;
5030 const char *reason = "No mapping for the Unicode character exists "
5031 "in the target multi-byte code page.";
5032 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5033 if (exc != NULL) {
5034 PyCodec_StrictErrors(exc);
5035 Py_DECREF(exc);
5036 }
5037 } else {
5038 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5039 }
5040 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005041}
5042
Alexander Belopolsky40018472011-02-26 01:02:56 +00005043PyObject *
5044PyUnicode_DecodeMBCSStateful(const char *s,
5045 Py_ssize_t size,
5046 const char *errors,
5047 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005048{
5049 PyUnicodeObject *v = NULL;
5050 int done;
5051
5052 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005053 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005054
5055#ifdef NEED_RETRY
5056 retry:
5057 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005058 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005059 else
5060#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005061 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005062
5063 if (done < 0) {
5064 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005066 }
5067
5068 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005070
5071#ifdef NEED_RETRY
5072 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005073 s += done;
5074 size -= done;
5075 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005076 }
5077#endif
5078
5079 return (PyObject *)v;
5080}
5081
Alexander Belopolsky40018472011-02-26 01:02:56 +00005082PyObject *
5083PyUnicode_DecodeMBCS(const char *s,
5084 Py_ssize_t size,
5085 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005086{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005087 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5088}
5089
5090/*
5091 * Convert unicode into string object (MBCS).
5092 * Returns 0 if succeed, -1 otherwise.
5093 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005094static int
5095encode_mbcs(PyObject **repr,
5096 const Py_UNICODE *p, /* unicode */
5097 int size, /* size of unicode */
5098 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005099{
Victor Stinner554f3f02010-06-16 23:33:54 +00005100 BOOL usedDefaultChar = FALSE;
5101 BOOL *pusedDefaultChar;
5102 int mbcssize;
5103 Py_ssize_t n;
5104 PyObject *exc = NULL;
5105 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005106
5107 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005108
Victor Stinner554f3f02010-06-16 23:33:54 +00005109 /* check and handle 'errors' arg */
5110 if (errors==NULL || strcmp(errors, "strict")==0) {
5111 flags = WC_NO_BEST_FIT_CHARS;
5112 pusedDefaultChar = &usedDefaultChar;
5113 } else if (strcmp(errors, "replace")==0) {
5114 flags = 0;
5115 pusedDefaultChar = NULL;
5116 } else {
5117 PyErr_Format(PyExc_ValueError,
5118 "mbcs encoding does not support errors='%s'",
5119 errors);
5120 return -1;
5121 }
5122
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005123 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005124 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005125 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5126 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 if (mbcssize == 0) {
5128 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5129 return -1;
5130 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005131 /* If we used a default char, then we failed! */
5132 if (pusedDefaultChar && *pusedDefaultChar)
5133 goto mbcs_encode_error;
5134 } else {
5135 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005136 }
5137
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005138 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 /* Create string object */
5140 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5141 if (*repr == NULL)
5142 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005143 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005144 }
5145 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005146 /* Extend string object */
5147 n = PyBytes_Size(*repr);
5148 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5149 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005150 }
5151
5152 /* Do the conversion */
5153 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005155 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5156 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5158 return -1;
5159 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005160 if (pusedDefaultChar && *pusedDefaultChar)
5161 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005162 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005163 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005164
5165mbcs_encode_error:
5166 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5167 Py_XDECREF(exc);
5168 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005169}
5170
Alexander Belopolsky40018472011-02-26 01:02:56 +00005171PyObject *
5172PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5173 Py_ssize_t size,
5174 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005175{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005176 PyObject *repr = NULL;
5177 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005178
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005179#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005180 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005181 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005182 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005183 else
5184#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005185 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005186
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005187 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 Py_XDECREF(repr);
5189 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005190 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005191
5192#ifdef NEED_RETRY
5193 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005194 p += INT_MAX;
5195 size -= INT_MAX;
5196 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005197 }
5198#endif
5199
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005200 return repr;
5201}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005202
Alexander Belopolsky40018472011-02-26 01:02:56 +00005203PyObject *
5204PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005205{
5206 if (!PyUnicode_Check(unicode)) {
5207 PyErr_BadArgument();
5208 return NULL;
5209 }
5210 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005211 PyUnicode_GET_SIZE(unicode),
5212 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005213}
5214
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005215#undef NEED_RETRY
5216
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005217#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005218
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219/* --- Character Mapping Codec -------------------------------------------- */
5220
Alexander Belopolsky40018472011-02-26 01:02:56 +00005221PyObject *
5222PyUnicode_DecodeCharmap(const char *s,
5223 Py_ssize_t size,
5224 PyObject *mapping,
5225 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005227 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005228 Py_ssize_t startinpos;
5229 Py_ssize_t endinpos;
5230 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232 PyUnicodeObject *v;
5233 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005234 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005235 PyObject *errorHandler = NULL;
5236 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005237 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005238 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005239
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 /* Default to Latin-1 */
5241 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005242 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243
5244 v = _PyUnicode_New(size);
5245 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005251 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 mapstring = PyUnicode_AS_UNICODE(mapping);
5253 maplen = PyUnicode_GET_SIZE(mapping);
5254 while (s < e) {
5255 unsigned char ch = *s;
5256 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 if (ch < maplen)
5259 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 if (x == 0xfffe) {
5262 /* undefined mapping */
5263 outpos = p-PyUnicode_AS_UNICODE(v);
5264 startinpos = s-starts;
5265 endinpos = startinpos+1;
5266 if (unicode_decode_call_errorhandler(
5267 errors, &errorHandler,
5268 "charmap", "character maps to <undefined>",
5269 &starts, &e, &startinpos, &endinpos, &exc, &s,
5270 &v, &outpos, &p)) {
5271 goto onError;
5272 }
5273 continue;
5274 }
5275 *p++ = x;
5276 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005277 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005278 }
5279 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005280 while (s < e) {
5281 unsigned char ch = *s;
5282 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005283
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5285 w = PyLong_FromLong((long)ch);
5286 if (w == NULL)
5287 goto onError;
5288 x = PyObject_GetItem(mapping, w);
5289 Py_DECREF(w);
5290 if (x == NULL) {
5291 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5292 /* No mapping found means: mapping is undefined. */
5293 PyErr_Clear();
5294 x = Py_None;
5295 Py_INCREF(x);
5296 } else
5297 goto onError;
5298 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005299
Benjamin Peterson29060642009-01-31 22:14:21 +00005300 /* Apply mapping */
5301 if (PyLong_Check(x)) {
5302 long value = PyLong_AS_LONG(x);
5303 if (value < 0 || value > 65535) {
5304 PyErr_SetString(PyExc_TypeError,
5305 "character mapping must be in range(65536)");
5306 Py_DECREF(x);
5307 goto onError;
5308 }
5309 *p++ = (Py_UNICODE)value;
5310 }
5311 else if (x == Py_None) {
5312 /* undefined mapping */
5313 outpos = p-PyUnicode_AS_UNICODE(v);
5314 startinpos = s-starts;
5315 endinpos = startinpos+1;
5316 if (unicode_decode_call_errorhandler(
5317 errors, &errorHandler,
5318 "charmap", "character maps to <undefined>",
5319 &starts, &e, &startinpos, &endinpos, &exc, &s,
5320 &v, &outpos, &p)) {
5321 Py_DECREF(x);
5322 goto onError;
5323 }
5324 Py_DECREF(x);
5325 continue;
5326 }
5327 else if (PyUnicode_Check(x)) {
5328 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005329
Benjamin Peterson29060642009-01-31 22:14:21 +00005330 if (targetsize == 1)
5331 /* 1-1 mapping */
5332 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005333
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 else if (targetsize > 1) {
5335 /* 1-n mapping */
5336 if (targetsize > extrachars) {
5337 /* resize first */
5338 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5339 Py_ssize_t needed = (targetsize - extrachars) + \
5340 (targetsize << 2);
5341 extrachars += needed;
5342 /* XXX overflow detection missing */
5343 if (_PyUnicode_Resize(&v,
5344 PyUnicode_GET_SIZE(v) + needed) < 0) {
5345 Py_DECREF(x);
5346 goto onError;
5347 }
5348 p = PyUnicode_AS_UNICODE(v) + oldpos;
5349 }
5350 Py_UNICODE_COPY(p,
5351 PyUnicode_AS_UNICODE(x),
5352 targetsize);
5353 p += targetsize;
5354 extrachars -= targetsize;
5355 }
5356 /* 1-0 mapping: skip the character */
5357 }
5358 else {
5359 /* wrong return value */
5360 PyErr_SetString(PyExc_TypeError,
5361 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005362 Py_DECREF(x);
5363 goto onError;
5364 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 Py_DECREF(x);
5366 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005367 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 }
5369 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5371 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 Py_XDECREF(errorHandler);
5373 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005375
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 Py_XDECREF(errorHandler);
5378 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 Py_XDECREF(v);
5380 return NULL;
5381}
5382
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005383/* Charmap encoding: the lookup table */
5384
Alexander Belopolsky40018472011-02-26 01:02:56 +00005385struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 PyObject_HEAD
5387 unsigned char level1[32];
5388 int count2, count3;
5389 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005390};
5391
5392static PyObject*
5393encoding_map_size(PyObject *obj, PyObject* args)
5394{
5395 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005396 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005398}
5399
5400static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005401 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 PyDoc_STR("Return the size (in bytes) of this object") },
5403 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005404};
5405
5406static void
5407encoding_map_dealloc(PyObject* o)
5408{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005409 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005410}
5411
5412static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005413 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 "EncodingMap", /*tp_name*/
5415 sizeof(struct encoding_map), /*tp_basicsize*/
5416 0, /*tp_itemsize*/
5417 /* methods */
5418 encoding_map_dealloc, /*tp_dealloc*/
5419 0, /*tp_print*/
5420 0, /*tp_getattr*/
5421 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005422 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005423 0, /*tp_repr*/
5424 0, /*tp_as_number*/
5425 0, /*tp_as_sequence*/
5426 0, /*tp_as_mapping*/
5427 0, /*tp_hash*/
5428 0, /*tp_call*/
5429 0, /*tp_str*/
5430 0, /*tp_getattro*/
5431 0, /*tp_setattro*/
5432 0, /*tp_as_buffer*/
5433 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5434 0, /*tp_doc*/
5435 0, /*tp_traverse*/
5436 0, /*tp_clear*/
5437 0, /*tp_richcompare*/
5438 0, /*tp_weaklistoffset*/
5439 0, /*tp_iter*/
5440 0, /*tp_iternext*/
5441 encoding_map_methods, /*tp_methods*/
5442 0, /*tp_members*/
5443 0, /*tp_getset*/
5444 0, /*tp_base*/
5445 0, /*tp_dict*/
5446 0, /*tp_descr_get*/
5447 0, /*tp_descr_set*/
5448 0, /*tp_dictoffset*/
5449 0, /*tp_init*/
5450 0, /*tp_alloc*/
5451 0, /*tp_new*/
5452 0, /*tp_free*/
5453 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005454};
5455
5456PyObject*
5457PyUnicode_BuildEncodingMap(PyObject* string)
5458{
5459 Py_UNICODE *decode;
5460 PyObject *result;
5461 struct encoding_map *mresult;
5462 int i;
5463 int need_dict = 0;
5464 unsigned char level1[32];
5465 unsigned char level2[512];
5466 unsigned char *mlevel1, *mlevel2, *mlevel3;
5467 int count2 = 0, count3 = 0;
5468
5469 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5470 PyErr_BadArgument();
5471 return NULL;
5472 }
5473 decode = PyUnicode_AS_UNICODE(string);
5474 memset(level1, 0xFF, sizeof level1);
5475 memset(level2, 0xFF, sizeof level2);
5476
5477 /* If there isn't a one-to-one mapping of NULL to \0,
5478 or if there are non-BMP characters, we need to use
5479 a mapping dictionary. */
5480 if (decode[0] != 0)
5481 need_dict = 1;
5482 for (i = 1; i < 256; i++) {
5483 int l1, l2;
5484 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005485#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005486 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005487#endif
5488 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005489 need_dict = 1;
5490 break;
5491 }
5492 if (decode[i] == 0xFFFE)
5493 /* unmapped character */
5494 continue;
5495 l1 = decode[i] >> 11;
5496 l2 = decode[i] >> 7;
5497 if (level1[l1] == 0xFF)
5498 level1[l1] = count2++;
5499 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005500 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005501 }
5502
5503 if (count2 >= 0xFF || count3 >= 0xFF)
5504 need_dict = 1;
5505
5506 if (need_dict) {
5507 PyObject *result = PyDict_New();
5508 PyObject *key, *value;
5509 if (!result)
5510 return NULL;
5511 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005512 key = PyLong_FromLong(decode[i]);
5513 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005514 if (!key || !value)
5515 goto failed1;
5516 if (PyDict_SetItem(result, key, value) == -1)
5517 goto failed1;
5518 Py_DECREF(key);
5519 Py_DECREF(value);
5520 }
5521 return result;
5522 failed1:
5523 Py_XDECREF(key);
5524 Py_XDECREF(value);
5525 Py_DECREF(result);
5526 return NULL;
5527 }
5528
5529 /* Create a three-level trie */
5530 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5531 16*count2 + 128*count3 - 1);
5532 if (!result)
5533 return PyErr_NoMemory();
5534 PyObject_Init(result, &EncodingMapType);
5535 mresult = (struct encoding_map*)result;
5536 mresult->count2 = count2;
5537 mresult->count3 = count3;
5538 mlevel1 = mresult->level1;
5539 mlevel2 = mresult->level23;
5540 mlevel3 = mresult->level23 + 16*count2;
5541 memcpy(mlevel1, level1, 32);
5542 memset(mlevel2, 0xFF, 16*count2);
5543 memset(mlevel3, 0, 128*count3);
5544 count3 = 0;
5545 for (i = 1; i < 256; i++) {
5546 int o1, o2, o3, i2, i3;
5547 if (decode[i] == 0xFFFE)
5548 /* unmapped character */
5549 continue;
5550 o1 = decode[i]>>11;
5551 o2 = (decode[i]>>7) & 0xF;
5552 i2 = 16*mlevel1[o1] + o2;
5553 if (mlevel2[i2] == 0xFF)
5554 mlevel2[i2] = count3++;
5555 o3 = decode[i] & 0x7F;
5556 i3 = 128*mlevel2[i2] + o3;
5557 mlevel3[i3] = i;
5558 }
5559 return result;
5560}
5561
5562static int
5563encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5564{
5565 struct encoding_map *map = (struct encoding_map*)mapping;
5566 int l1 = c>>11;
5567 int l2 = (c>>7) & 0xF;
5568 int l3 = c & 0x7F;
5569 int i;
5570
5571#ifdef Py_UNICODE_WIDE
5572 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005574 }
5575#endif
5576 if (c == 0)
5577 return 0;
5578 /* level 1*/
5579 i = map->level1[l1];
5580 if (i == 0xFF) {
5581 return -1;
5582 }
5583 /* level 2*/
5584 i = map->level23[16*i+l2];
5585 if (i == 0xFF) {
5586 return -1;
5587 }
5588 /* level 3 */
5589 i = map->level23[16*map->count2 + 128*i + l3];
5590 if (i == 0) {
5591 return -1;
5592 }
5593 return i;
5594}
5595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005596/* Lookup the character ch in the mapping. If the character
5597 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005598 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005599static PyObject *
5600charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601{
Christian Heimes217cfd12007-12-02 14:31:20 +00005602 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 PyObject *x;
5604
5605 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 x = PyObject_GetItem(mapping, w);
5608 Py_DECREF(w);
5609 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005610 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5611 /* No mapping found means: mapping is undefined. */
5612 PyErr_Clear();
5613 x = Py_None;
5614 Py_INCREF(x);
5615 return x;
5616 } else
5617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005619 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005621 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 long value = PyLong_AS_LONG(x);
5623 if (value < 0 || value > 255) {
5624 PyErr_SetString(PyExc_TypeError,
5625 "character mapping must be in range(256)");
5626 Py_DECREF(x);
5627 return NULL;
5628 }
5629 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005631 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005632 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005634 /* wrong return value */
5635 PyErr_Format(PyExc_TypeError,
5636 "character mapping must return integer, bytes or None, not %.400s",
5637 x->ob_type->tp_name);
5638 Py_DECREF(x);
5639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 }
5641}
5642
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005643static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005644charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005645{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005646 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5647 /* exponentially overallocate to minimize reallocations */
5648 if (requiredsize < 2*outsize)
5649 requiredsize = 2*outsize;
5650 if (_PyBytes_Resize(outobj, requiredsize))
5651 return -1;
5652 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005653}
5654
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005659 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005660 space is available. Return a new reference to the object that
5661 was put in the output buffer, or Py_None, if the mapping was undefined
5662 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005663 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005664static charmapencode_result
5665charmapencode_output(Py_UNICODE c, PyObject *mapping,
5666 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005667{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005668 PyObject *rep;
5669 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005670 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671
Christian Heimes90aa7642007-12-19 02:45:37 +00005672 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005673 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005675 if (res == -1)
5676 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005677 if (outsize<requiredsize)
5678 if (charmapencode_resize(outobj, outpos, requiredsize))
5679 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005680 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 outstart[(*outpos)++] = (char)res;
5682 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005683 }
5684
5685 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005688 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 Py_DECREF(rep);
5690 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005691 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 if (PyLong_Check(rep)) {
5693 Py_ssize_t requiredsize = *outpos+1;
5694 if (outsize<requiredsize)
5695 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5696 Py_DECREF(rep);
5697 return enc_EXCEPTION;
5698 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005699 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005701 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 else {
5703 const char *repchars = PyBytes_AS_STRING(rep);
5704 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5705 Py_ssize_t requiredsize = *outpos+repsize;
5706 if (outsize<requiredsize)
5707 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5708 Py_DECREF(rep);
5709 return enc_EXCEPTION;
5710 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005711 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005712 memcpy(outstart + *outpos, repchars, repsize);
5713 *outpos += repsize;
5714 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005715 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005716 Py_DECREF(rep);
5717 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005718}
5719
5720/* handle an error in PyUnicode_EncodeCharmap
5721 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005722static int
5723charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005724 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005726 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005727 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005728{
5729 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005730 Py_ssize_t repsize;
5731 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 Py_UNICODE *uni2;
5733 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005734 Py_ssize_t collstartpos = *inpos;
5735 Py_ssize_t collendpos = *inpos+1;
5736 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 char *encoding = "charmap";
5738 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005739 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 /* find all unencodable characters */
5742 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005743 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005744 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 int res = encoding_map_lookup(p[collendpos], mapping);
5746 if (res != -1)
5747 break;
5748 ++collendpos;
5749 continue;
5750 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005751
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 rep = charmapencode_lookup(p[collendpos], mapping);
5753 if (rep==NULL)
5754 return -1;
5755 else if (rep!=Py_None) {
5756 Py_DECREF(rep);
5757 break;
5758 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005759 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005761 }
5762 /* cache callback name lookup
5763 * (if not done yet, i.e. it's the first error) */
5764 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 if ((errors==NULL) || (!strcmp(errors, "strict")))
5766 *known_errorHandler = 1;
5767 else if (!strcmp(errors, "replace"))
5768 *known_errorHandler = 2;
5769 else if (!strcmp(errors, "ignore"))
5770 *known_errorHandler = 3;
5771 else if (!strcmp(errors, "xmlcharrefreplace"))
5772 *known_errorHandler = 4;
5773 else
5774 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 }
5776 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005777 case 1: /* strict */
5778 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5779 return -1;
5780 case 2: /* replace */
5781 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 x = charmapencode_output('?', mapping, res, respos);
5783 if (x==enc_EXCEPTION) {
5784 return -1;
5785 }
5786 else if (x==enc_FAILED) {
5787 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5788 return -1;
5789 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005790 }
5791 /* fall through */
5792 case 3: /* ignore */
5793 *inpos = collendpos;
5794 break;
5795 case 4: /* xmlcharrefreplace */
5796 /* generate replacement (temporarily (mis)uses p) */
5797 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005798 char buffer[2+29+1+1];
5799 char *cp;
5800 sprintf(buffer, "&#%d;", (int)p[collpos]);
5801 for (cp = buffer; *cp; ++cp) {
5802 x = charmapencode_output(*cp, mapping, res, respos);
5803 if (x==enc_EXCEPTION)
5804 return -1;
5805 else if (x==enc_FAILED) {
5806 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5807 return -1;
5808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005809 }
5810 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005811 *inpos = collendpos;
5812 break;
5813 default:
5814 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 encoding, reason, p, size, exceptionObject,
5816 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005817 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005819 if (PyBytes_Check(repunicode)) {
5820 /* Directly copy bytes result to output. */
5821 Py_ssize_t outsize = PyBytes_Size(*res);
5822 Py_ssize_t requiredsize;
5823 repsize = PyBytes_Size(repunicode);
5824 requiredsize = *respos + repsize;
5825 if (requiredsize > outsize)
5826 /* Make room for all additional bytes. */
5827 if (charmapencode_resize(res, respos, requiredsize)) {
5828 Py_DECREF(repunicode);
5829 return -1;
5830 }
5831 memcpy(PyBytes_AsString(*res) + *respos,
5832 PyBytes_AsString(repunicode), repsize);
5833 *respos += repsize;
5834 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005835 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005836 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005838 /* generate replacement */
5839 repsize = PyUnicode_GET_SIZE(repunicode);
5840 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 x = charmapencode_output(*uni2, mapping, res, respos);
5842 if (x==enc_EXCEPTION) {
5843 return -1;
5844 }
5845 else if (x==enc_FAILED) {
5846 Py_DECREF(repunicode);
5847 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5848 return -1;
5849 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005850 }
5851 *inpos = newpos;
5852 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 }
5854 return 0;
5855}
5856
Alexander Belopolsky40018472011-02-26 01:02:56 +00005857PyObject *
5858PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5859 Py_ssize_t size,
5860 PyObject *mapping,
5861 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 /* output object */
5864 PyObject *res = NULL;
5865 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005866 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005868 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005869 PyObject *errorHandler = NULL;
5870 PyObject *exc = NULL;
5871 /* the following variable is used for caching string comparisons
5872 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5873 * 3=ignore, 4=xmlcharrefreplace */
5874 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875
5876 /* Default to Latin-1 */
5877 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 /* allocate enough for a simple encoding without
5881 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005882 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 if (res == NULL)
5884 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005885 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005886 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005888 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005889 /* try to encode it */
5890 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5891 if (x==enc_EXCEPTION) /* error */
5892 goto onError;
5893 if (x==enc_FAILED) { /* unencodable character */
5894 if (charmap_encoding_error(p, size, &inpos, mapping,
5895 &exc,
5896 &known_errorHandler, &errorHandler, errors,
5897 &res, &respos)) {
5898 goto onError;
5899 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005900 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 else
5902 /* done with this character => adjust input position */
5903 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005907 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005908 if (_PyBytes_Resize(&res, respos) < 0)
5909 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005910
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 Py_XDECREF(exc);
5912 Py_XDECREF(errorHandler);
5913 return res;
5914
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 Py_XDECREF(res);
5917 Py_XDECREF(exc);
5918 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 return NULL;
5920}
5921
Alexander Belopolsky40018472011-02-26 01:02:56 +00005922PyObject *
5923PyUnicode_AsCharmapString(PyObject *unicode,
5924 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925{
5926 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 PyErr_BadArgument();
5928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 }
5930 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 PyUnicode_GET_SIZE(unicode),
5932 mapping,
5933 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934}
5935
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005937static void
5938make_translate_exception(PyObject **exceptionObject,
5939 const Py_UNICODE *unicode, Py_ssize_t size,
5940 Py_ssize_t startpos, Py_ssize_t endpos,
5941 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005943 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005944 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 }
5947 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005948 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5949 goto onError;
5950 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5951 goto onError;
5952 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5953 goto onError;
5954 return;
5955 onError:
5956 Py_DECREF(*exceptionObject);
5957 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 }
5959}
5960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005961/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005962static void
5963raise_translate_exception(PyObject **exceptionObject,
5964 const Py_UNICODE *unicode, Py_ssize_t size,
5965 Py_ssize_t startpos, Py_ssize_t endpos,
5966 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967{
5968 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005970 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005972}
5973
5974/* error handling callback helper:
5975 build arguments, call the callback and check the arguments,
5976 put the result into newpos and return the replacement string, which
5977 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005978static PyObject *
5979unicode_translate_call_errorhandler(const char *errors,
5980 PyObject **errorHandler,
5981 const char *reason,
5982 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5983 Py_ssize_t startpos, Py_ssize_t endpos,
5984 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005985{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005986 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005988 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 PyObject *restuple;
5990 PyObject *resunicode;
5991
5992 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 }
5997
5998 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002
6003 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006005 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006006 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006007 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00006008 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 Py_DECREF(restuple);
6010 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 }
6012 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 &resunicode, &i_newpos)) {
6014 Py_DECREF(restuple);
6015 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006016 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006017 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006019 else
6020 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006021 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6023 Py_DECREF(restuple);
6024 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006025 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 Py_INCREF(resunicode);
6027 Py_DECREF(restuple);
6028 return resunicode;
6029}
6030
6031/* Lookup the character ch in the mapping and put the result in result,
6032 which must be decrefed by the caller.
6033 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006034static int
6035charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006036{
Christian Heimes217cfd12007-12-02 14:31:20 +00006037 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006038 PyObject *x;
6039
6040 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 x = PyObject_GetItem(mapping, w);
6043 Py_DECREF(w);
6044 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6046 /* No mapping found means: use 1:1 mapping. */
6047 PyErr_Clear();
6048 *result = NULL;
6049 return 0;
6050 } else
6051 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052 }
6053 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 *result = x;
6055 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006057 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 long value = PyLong_AS_LONG(x);
6059 long max = PyUnicode_GetMax();
6060 if (value < 0 || value > max) {
6061 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006062 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006063 Py_DECREF(x);
6064 return -1;
6065 }
6066 *result = x;
6067 return 0;
6068 }
6069 else if (PyUnicode_Check(x)) {
6070 *result = x;
6071 return 0;
6072 }
6073 else {
6074 /* wrong return value */
6075 PyErr_SetString(PyExc_TypeError,
6076 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006077 Py_DECREF(x);
6078 return -1;
6079 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006080}
6081/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 if not reallocate and adjust various state variables.
6083 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006084static int
6085charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006088 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006089 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 /* remember old output position */
6091 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6092 /* exponentially overallocate to minimize reallocations */
6093 if (requiredsize < 2 * oldsize)
6094 requiredsize = 2 * oldsize;
6095 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6096 return -1;
6097 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006098 }
6099 return 0;
6100}
6101/* lookup the character, put the result in the output string and adjust
6102 various state variables. Return a new reference to the object that
6103 was put in the output buffer in *result, or Py_None, if the mapping was
6104 undefined (in which case no character was written).
6105 The called must decref result.
6106 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006107static int
6108charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6109 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6110 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111{
Walter Dörwald4894c302003-10-24 14:25:28 +00006112 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 /* not found => default to 1:1 mapping */
6116 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006117 }
6118 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006120 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 /* no overflow check, because we know that the space is enough */
6122 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 }
6124 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6126 if (repsize==1) {
6127 /* no overflow check, because we know that the space is enough */
6128 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6129 }
6130 else if (repsize!=0) {
6131 /* more than one character */
6132 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6133 (insize - (curinp-startinp)) +
6134 repsize - 1;
6135 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6136 return -1;
6137 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6138 *outp += repsize;
6139 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 }
6141 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 return 0;
6144}
6145
Alexander Belopolsky40018472011-02-26 01:02:56 +00006146PyObject *
6147PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6148 Py_ssize_t size,
6149 PyObject *mapping,
6150 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006152 /* output object */
6153 PyObject *res = NULL;
6154 /* pointers to the beginning and end+1 of input */
6155 const Py_UNICODE *startp = p;
6156 const Py_UNICODE *endp = p + size;
6157 /* pointer into the output */
6158 Py_UNICODE *str;
6159 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006160 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161 char *reason = "character maps to <undefined>";
6162 PyObject *errorHandler = NULL;
6163 PyObject *exc = NULL;
6164 /* the following variable is used for caching string comparisons
6165 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6166 * 3=ignore, 4=xmlcharrefreplace */
6167 int known_errorHandler = -1;
6168
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 PyErr_BadArgument();
6171 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173
6174 /* allocate enough for a simple 1:1 translation without
6175 replacements, if we need more, we'll resize */
6176 res = PyUnicode_FromUnicode(NULL, size);
6177 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006181 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006183 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 /* try to encode it */
6185 PyObject *x = NULL;
6186 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6187 Py_XDECREF(x);
6188 goto onError;
6189 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006190 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 if (x!=Py_None) /* it worked => adjust input pointer */
6192 ++p;
6193 else { /* untranslatable character */
6194 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6195 Py_ssize_t repsize;
6196 Py_ssize_t newpos;
6197 Py_UNICODE *uni2;
6198 /* startpos for collecting untranslatable chars */
6199 const Py_UNICODE *collstart = p;
6200 const Py_UNICODE *collend = p+1;
6201 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 /* find all untranslatable characters */
6204 while (collend < endp) {
6205 if (charmaptranslate_lookup(*collend, mapping, &x))
6206 goto onError;
6207 Py_XDECREF(x);
6208 if (x!=Py_None)
6209 break;
6210 ++collend;
6211 }
6212 /* cache callback name lookup
6213 * (if not done yet, i.e. it's the first error) */
6214 if (known_errorHandler==-1) {
6215 if ((errors==NULL) || (!strcmp(errors, "strict")))
6216 known_errorHandler = 1;
6217 else if (!strcmp(errors, "replace"))
6218 known_errorHandler = 2;
6219 else if (!strcmp(errors, "ignore"))
6220 known_errorHandler = 3;
6221 else if (!strcmp(errors, "xmlcharrefreplace"))
6222 known_errorHandler = 4;
6223 else
6224 known_errorHandler = 0;
6225 }
6226 switch (known_errorHandler) {
6227 case 1: /* strict */
6228 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006229 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 case 2: /* replace */
6231 /* No need to check for space, this is a 1:1 replacement */
6232 for (coll = collstart; coll<collend; ++coll)
6233 *str++ = '?';
6234 /* fall through */
6235 case 3: /* ignore */
6236 p = collend;
6237 break;
6238 case 4: /* xmlcharrefreplace */
6239 /* generate replacement (temporarily (mis)uses p) */
6240 for (p = collstart; p < collend; ++p) {
6241 char buffer[2+29+1+1];
6242 char *cp;
6243 sprintf(buffer, "&#%d;", (int)*p);
6244 if (charmaptranslate_makespace(&res, &str,
6245 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6246 goto onError;
6247 for (cp = buffer; *cp; ++cp)
6248 *str++ = *cp;
6249 }
6250 p = collend;
6251 break;
6252 default:
6253 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6254 reason, startp, size, &exc,
6255 collstart-startp, collend-startp, &newpos);
6256 if (repunicode == NULL)
6257 goto onError;
6258 /* generate replacement */
6259 repsize = PyUnicode_GET_SIZE(repunicode);
6260 if (charmaptranslate_makespace(&res, &str,
6261 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6262 Py_DECREF(repunicode);
6263 goto onError;
6264 }
6265 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6266 *str++ = *uni2;
6267 p = startp + newpos;
6268 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006269 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006270 }
6271 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 /* Resize if we allocated to much */
6273 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006274 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 if (PyUnicode_Resize(&res, respos) < 0)
6276 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006277 }
6278 Py_XDECREF(exc);
6279 Py_XDECREF(errorHandler);
6280 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 Py_XDECREF(res);
6284 Py_XDECREF(exc);
6285 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286 return NULL;
6287}
6288
Alexander Belopolsky40018472011-02-26 01:02:56 +00006289PyObject *
6290PyUnicode_Translate(PyObject *str,
6291 PyObject *mapping,
6292 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006293{
6294 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006295
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 str = PyUnicode_FromObject(str);
6297 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 PyUnicode_GET_SIZE(str),
6301 mapping,
6302 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 Py_DECREF(str);
6304 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006305
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 Py_XDECREF(str);
6308 return NULL;
6309}
Tim Petersced69f82003-09-16 20:30:58 +00006310
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006311PyObject *
6312PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6313 Py_ssize_t length)
6314{
6315 PyObject *result;
6316 Py_UNICODE *p; /* write pointer into result */
6317 Py_ssize_t i;
6318 /* Copy to a new string */
6319 result = (PyObject *)_PyUnicode_New(length);
6320 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6321 if (result == NULL)
6322 return result;
6323 p = PyUnicode_AS_UNICODE(result);
6324 /* Iterate over code points */
6325 for (i = 0; i < length; i++) {
6326 Py_UNICODE ch =s[i];
6327 if (ch > 127) {
6328 int decimal = Py_UNICODE_TODECIMAL(ch);
6329 if (decimal >= 0)
6330 p[i] = '0' + decimal;
6331 }
6332 }
6333 return result;
6334}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006335/* --- Decimal Encoder ---------------------------------------------------- */
6336
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337int
6338PyUnicode_EncodeDecimal(Py_UNICODE *s,
6339 Py_ssize_t length,
6340 char *output,
6341 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006342{
6343 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344 PyObject *errorHandler = NULL;
6345 PyObject *exc = NULL;
6346 const char *encoding = "decimal";
6347 const char *reason = "invalid decimal Unicode string";
6348 /* the following variable is used for caching string comparisons
6349 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6350 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006351
6352 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 PyErr_BadArgument();
6354 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006355 }
6356
6357 p = s;
6358 end = s + length;
6359 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 register Py_UNICODE ch = *p;
6361 int decimal;
6362 PyObject *repunicode;
6363 Py_ssize_t repsize;
6364 Py_ssize_t newpos;
6365 Py_UNICODE *uni2;
6366 Py_UNICODE *collstart;
6367 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006368
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006370 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 ++p;
6372 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006373 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 decimal = Py_UNICODE_TODECIMAL(ch);
6375 if (decimal >= 0) {
6376 *output++ = '0' + decimal;
6377 ++p;
6378 continue;
6379 }
6380 if (0 < ch && ch < 256) {
6381 *output++ = (char)ch;
6382 ++p;
6383 continue;
6384 }
6385 /* All other characters are considered unencodable */
6386 collstart = p;
6387 collend = p+1;
6388 while (collend < end) {
6389 if ((0 < *collend && *collend < 256) ||
6390 !Py_UNICODE_ISSPACE(*collend) ||
6391 Py_UNICODE_TODECIMAL(*collend))
6392 break;
6393 }
6394 /* cache callback name lookup
6395 * (if not done yet, i.e. it's the first error) */
6396 if (known_errorHandler==-1) {
6397 if ((errors==NULL) || (!strcmp(errors, "strict")))
6398 known_errorHandler = 1;
6399 else if (!strcmp(errors, "replace"))
6400 known_errorHandler = 2;
6401 else if (!strcmp(errors, "ignore"))
6402 known_errorHandler = 3;
6403 else if (!strcmp(errors, "xmlcharrefreplace"))
6404 known_errorHandler = 4;
6405 else
6406 known_errorHandler = 0;
6407 }
6408 switch (known_errorHandler) {
6409 case 1: /* strict */
6410 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6411 goto onError;
6412 case 2: /* replace */
6413 for (p = collstart; p < collend; ++p)
6414 *output++ = '?';
6415 /* fall through */
6416 case 3: /* ignore */
6417 p = collend;
6418 break;
6419 case 4: /* xmlcharrefreplace */
6420 /* generate replacement (temporarily (mis)uses p) */
6421 for (p = collstart; p < collend; ++p)
6422 output += sprintf(output, "&#%d;", (int)*p);
6423 p = collend;
6424 break;
6425 default:
6426 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6427 encoding, reason, s, length, &exc,
6428 collstart-s, collend-s, &newpos);
6429 if (repunicode == NULL)
6430 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006431 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006432 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006433 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6434 Py_DECREF(repunicode);
6435 goto onError;
6436 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 /* generate replacement */
6438 repsize = PyUnicode_GET_SIZE(repunicode);
6439 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6440 Py_UNICODE ch = *uni2;
6441 if (Py_UNICODE_ISSPACE(ch))
6442 *output++ = ' ';
6443 else {
6444 decimal = Py_UNICODE_TODECIMAL(ch);
6445 if (decimal >= 0)
6446 *output++ = '0' + decimal;
6447 else if (0 < ch && ch < 256)
6448 *output++ = (char)ch;
6449 else {
6450 Py_DECREF(repunicode);
6451 raise_encode_exception(&exc, encoding,
6452 s, length, collstart-s, collend-s, reason);
6453 goto onError;
6454 }
6455 }
6456 }
6457 p = s + newpos;
6458 Py_DECREF(repunicode);
6459 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006460 }
6461 /* 0-terminate the output string */
6462 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 Py_XDECREF(exc);
6464 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006465 return 0;
6466
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468 Py_XDECREF(exc);
6469 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006470 return -1;
6471}
6472
Guido van Rossumd57fd912000-03-10 22:53:23 +00006473/* --- Helpers ------------------------------------------------------------ */
6474
Eric Smith8c663262007-08-25 02:26:07 +00006475#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006476#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006477
Thomas Wouters477c8d52006-05-27 19:21:47 +00006478#include "stringlib/count.h"
6479#include "stringlib/find.h"
6480#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006481#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006482
Eric Smith5807c412008-05-11 21:00:57 +00006483#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006484#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006485#include "stringlib/localeutil.h"
6486
/* Helper macro to fix up start/end slice values in place: `end` is clamped
   to `len`; a negative `end` or `start` is taken relative to `len` and
   becomes 0 if it is still negative. `start` is not clamped to `len`.
   `start` and `end` must be modifiable lvalues and `len` is evaluated more
   than once. The do { } while (0) wrapper makes the expansion a single
   statement, so the macro is safe as the body of an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006501
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502Py_ssize_t
6503PyUnicode_Count(PyObject *str,
6504 PyObject *substr,
6505 Py_ssize_t start,
6506 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006508 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006509 PyUnicodeObject* str_obj;
6510 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006511
Thomas Wouters477c8d52006-05-27 19:21:47 +00006512 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6513 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006515 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6516 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 Py_DECREF(str_obj);
6518 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519 }
Tim Petersced69f82003-09-16 20:30:58 +00006520
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006521 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006522 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006523 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6524 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006525 );
6526
6527 Py_DECREF(sub_obj);
6528 Py_DECREF(str_obj);
6529
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 return result;
6531}
6532
Alexander Belopolsky40018472011-02-26 01:02:56 +00006533Py_ssize_t
6534PyUnicode_Find(PyObject *str,
6535 PyObject *sub,
6536 Py_ssize_t start,
6537 Py_ssize_t end,
6538 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006540 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006541
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006543 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006545 sub = PyUnicode_FromObject(sub);
6546 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 Py_DECREF(str);
6548 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 }
Tim Petersced69f82003-09-16 20:30:58 +00006550
Thomas Wouters477c8d52006-05-27 19:21:47 +00006551 if (direction > 0)
6552 result = stringlib_find_slice(
6553 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6554 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6555 start, end
6556 );
6557 else
6558 result = stringlib_rfind_slice(
6559 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6560 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6561 start, end
6562 );
6563
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006565 Py_DECREF(sub);
6566
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 return result;
6568}
6569
Alexander Belopolsky40018472011-02-26 01:02:56 +00006570static int
6571tailmatch(PyUnicodeObject *self,
6572 PyUnicodeObject *substring,
6573 Py_ssize_t start,
6574 Py_ssize_t end,
6575 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 if (substring->length == 0)
6578 return 1;
6579
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006580 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 end -= substring->length;
6582 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006583 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
6585 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006586 if (Py_UNICODE_MATCH(self, end, substring))
6587 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 } else {
6589 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 }
6592
6593 return 0;
6594}
6595
Alexander Belopolsky40018472011-02-26 01:02:56 +00006596Py_ssize_t
6597PyUnicode_Tailmatch(PyObject *str,
6598 PyObject *substr,
6599 Py_ssize_t start,
6600 Py_ssize_t end,
6601 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006603 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006604
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 str = PyUnicode_FromObject(str);
6606 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 substr = PyUnicode_FromObject(substr);
6609 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 Py_DECREF(str);
6611 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 }
Tim Petersced69f82003-09-16 20:30:58 +00006613
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006615 (PyUnicodeObject *)substr,
6616 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 Py_DECREF(str);
6618 Py_DECREF(substr);
6619 return result;
6620}
6621
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622/* Apply fixfct filter to the Unicode object self and return a
6623 reference to the modified object */
6624
Alexander Belopolsky40018472011-02-26 01:02:56 +00006625static PyObject *
6626fixup(PyUnicodeObject *self,
6627 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628{
6629
6630 PyUnicodeObject *u;
6631
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006632 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006635
6636 Py_UNICODE_COPY(u->str, self->str, self->length);
6637
Tim Peters7a29bd52001-09-12 03:03:31 +00006638 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006639 /* fixfct should return TRUE if it modified the buffer. If
6640 FALSE, return a reference to the original buffer instead
6641 (to save space, not time) */
6642 Py_INCREF(self);
6643 Py_DECREF(u);
6644 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 }
6646 return (PyObject*) u;
6647}
6648
Alexander Belopolsky40018472011-02-26 01:02:56 +00006649static int
6650fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006652 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 Py_UNICODE *s = self->str;
6654 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006655
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006658
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 ch = Py_UNICODE_TOUPPER(*s);
6660 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 *s = ch;
6663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 s++;
6665 }
6666
6667 return status;
6668}
6669
Alexander Belopolsky40018472011-02-26 01:02:56 +00006670static int
6671fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006673 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 Py_UNICODE *s = self->str;
6675 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006676
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006679
Benjamin Peterson29060642009-01-31 22:14:21 +00006680 ch = Py_UNICODE_TOLOWER(*s);
6681 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 *s = ch;
6684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685 s++;
6686 }
6687
6688 return status;
6689}
6690
Alexander Belopolsky40018472011-02-26 01:02:56 +00006691static int
6692fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 Py_UNICODE *s = self->str;
6696 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006697
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 while (len-- > 0) {
6699 if (Py_UNICODE_ISUPPER(*s)) {
6700 *s = Py_UNICODE_TOLOWER(*s);
6701 status = 1;
6702 } else if (Py_UNICODE_ISLOWER(*s)) {
6703 *s = Py_UNICODE_TOUPPER(*s);
6704 status = 1;
6705 }
6706 s++;
6707 }
6708
6709 return status;
6710}
6711
Alexander Belopolsky40018472011-02-26 01:02:56 +00006712static int
6713fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006715 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006716 Py_UNICODE *s = self->str;
6717 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006718
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006719 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006721 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 *s = Py_UNICODE_TOUPPER(*s);
6723 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006725 s++;
6726 while (--len > 0) {
6727 if (Py_UNICODE_ISUPPER(*s)) {
6728 *s = Py_UNICODE_TOLOWER(*s);
6729 status = 1;
6730 }
6731 s++;
6732 }
6733 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734}
6735
Alexander Belopolsky40018472011-02-26 01:02:56 +00006736static int
6737fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738{
6739 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6740 register Py_UNICODE *e;
6741 int previous_is_cased;
6742
6743 /* Shortcut for single character strings */
6744 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6746 if (*p != ch) {
6747 *p = ch;
6748 return 1;
6749 }
6750 else
6751 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 }
Tim Petersced69f82003-09-16 20:30:58 +00006753
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 e = p + PyUnicode_GET_SIZE(self);
6755 previous_is_cased = 0;
6756 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006757 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006758
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 if (previous_is_cased)
6760 *p = Py_UNICODE_TOLOWER(ch);
6761 else
6762 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006763
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 if (Py_UNICODE_ISLOWER(ch) ||
6765 Py_UNICODE_ISUPPER(ch) ||
6766 Py_UNICODE_ISTITLE(ch))
6767 previous_is_cased = 1;
6768 else
6769 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 }
6771 return 1;
6772}
6773
Tim Peters8ce9f162004-08-27 01:49:32 +00006774PyObject *
6775PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776{
Skip Montanaro6543b452004-09-16 03:28:13 +00006777 const Py_UNICODE blank = ' ';
6778 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006779 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006780 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006781 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6782 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006783 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6784 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006785 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006786 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787
Tim Peters05eba1f2004-08-27 21:32:02 +00006788 fseq = PySequence_Fast(seq, "");
6789 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006790 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006791 }
6792
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006793 /* NOTE: the following code can't call back into Python code,
6794 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006795 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006796
Tim Peters05eba1f2004-08-27 21:32:02 +00006797 seqlen = PySequence_Fast_GET_SIZE(fseq);
6798 /* If empty sequence, return u"". */
6799 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006800 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6801 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006802 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006803 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006804 /* If singleton sequence with an exact Unicode, return that. */
6805 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006806 item = items[0];
6807 if (PyUnicode_CheckExact(item)) {
6808 Py_INCREF(item);
6809 res = (PyUnicodeObject *)item;
6810 goto Done;
6811 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006812 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006813 else {
6814 /* Set up sep and seplen */
6815 if (separator == NULL) {
6816 sep = &blank;
6817 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006818 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006819 else {
6820 if (!PyUnicode_Check(separator)) {
6821 PyErr_Format(PyExc_TypeError,
6822 "separator: expected str instance,"
6823 " %.80s found",
6824 Py_TYPE(separator)->tp_name);
6825 goto onError;
6826 }
6827 sep = PyUnicode_AS_UNICODE(separator);
6828 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006829 }
6830 }
6831
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006832 /* There are at least two things to join, or else we have a subclass
6833 * of str in the sequence.
6834 * Do a pre-pass to figure out the total amount of space we'll
6835 * need (sz), and see whether all argument are strings.
6836 */
6837 sz = 0;
6838 for (i = 0; i < seqlen; i++) {
6839 const Py_ssize_t old_sz = sz;
6840 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 if (!PyUnicode_Check(item)) {
6842 PyErr_Format(PyExc_TypeError,
6843 "sequence item %zd: expected str instance,"
6844 " %.80s found",
6845 i, Py_TYPE(item)->tp_name);
6846 goto onError;
6847 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006848 sz += PyUnicode_GET_SIZE(item);
6849 if (i != 0)
6850 sz += seplen;
6851 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6852 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006854 goto onError;
6855 }
6856 }
Tim Petersced69f82003-09-16 20:30:58 +00006857
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006858 res = _PyUnicode_New(sz);
6859 if (res == NULL)
6860 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006861
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006862 /* Catenate everything. */
6863 res_p = PyUnicode_AS_UNICODE(res);
6864 for (i = 0; i < seqlen; ++i) {
6865 Py_ssize_t itemlen;
6866 item = items[i];
6867 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 /* Copy item, and maybe the separator. */
6869 if (i) {
6870 Py_UNICODE_COPY(res_p, sep, seplen);
6871 res_p += seplen;
6872 }
6873 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6874 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006875 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006876
Benjamin Peterson29060642009-01-31 22:14:21 +00006877 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006878 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 return (PyObject *)res;
6880
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006882 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006883 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 return NULL;
6885}
6886
Alexander Belopolsky40018472011-02-26 01:02:56 +00006887static PyUnicodeObject *
6888pad(PyUnicodeObject *self,
6889 Py_ssize_t left,
6890 Py_ssize_t right,
6891 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006892{
6893 PyUnicodeObject *u;
6894
6895 if (left < 0)
6896 left = 0;
6897 if (right < 0)
6898 right = 0;
6899
Tim Peters7a29bd52001-09-12 03:03:31 +00006900 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 Py_INCREF(self);
6902 return self;
6903 }
6904
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006905 if (left > PY_SSIZE_T_MAX - self->length ||
6906 right > PY_SSIZE_T_MAX - (left + self->length)) {
6907 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6908 return NULL;
6909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 u = _PyUnicode_New(left + self->length + right);
6911 if (u) {
6912 if (left)
6913 Py_UNICODE_FILL(u->str, fill, left);
6914 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6915 if (right)
6916 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6917 }
6918
6919 return u;
6920}
6921
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyObject *
6923PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006924{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926
6927 string = PyUnicode_FromObject(string);
6928 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006931 list = stringlib_splitlines(
6932 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6933 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934
6935 Py_DECREF(string);
6936 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937}
6938
Alexander Belopolsky40018472011-02-26 01:02:56 +00006939static PyObject *
6940split(PyUnicodeObject *self,
6941 PyUnicodeObject *substring,
6942 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006944 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006945 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006948 return stringlib_split_whitespace(
6949 (PyObject*) self, self->str, self->length, maxcount
6950 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006952 return stringlib_split(
6953 (PyObject*) self, self->str, self->length,
6954 substring->str, substring->length,
6955 maxcount
6956 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957}
6958
Alexander Belopolsky40018472011-02-26 01:02:56 +00006959static PyObject *
6960rsplit(PyUnicodeObject *self,
6961 PyUnicodeObject *substring,
6962 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006963{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006964 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006965 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006966
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006967 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006968 return stringlib_rsplit_whitespace(
6969 (PyObject*) self, self->str, self->length, maxcount
6970 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006971
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006972 return stringlib_rsplit(
6973 (PyObject*) self, self->str, self->length,
6974 substring->str, substring->length,
6975 maxcount
6976 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006977}
6978
Alexander Belopolsky40018472011-02-26 01:02:56 +00006979static PyObject *
6980replace(PyUnicodeObject *self,
6981 PyUnicodeObject *str1,
6982 PyUnicodeObject *str2,
6983 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984{
6985 PyUnicodeObject *u;
6986
6987 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006989 else if (maxcount == 0 || self->length == 0)
6990 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991
Thomas Wouters477c8d52006-05-27 19:21:47 +00006992 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006993 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006994 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006995 if (str1->length == 0)
6996 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006997 if (str1->length == 1) {
6998 /* replace characters */
6999 Py_UNICODE u1, u2;
7000 if (!findchar(self->str, self->length, str1->str[0]))
7001 goto nothing;
7002 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7003 if (!u)
7004 return NULL;
7005 Py_UNICODE_COPY(u->str, self->str, self->length);
7006 u1 = str1->str[0];
7007 u2 = str2->str[0];
7008 for (i = 0; i < u->length; i++)
7009 if (u->str[i] == u1) {
7010 if (--maxcount < 0)
7011 break;
7012 u->str[i] = u2;
7013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007015 i = stringlib_find(
7016 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007017 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007018 if (i < 0)
7019 goto nothing;
7020 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7021 if (!u)
7022 return NULL;
7023 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007024
7025 /* change everything in-place, starting with this one */
7026 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7027 i += str1->length;
7028
7029 while ( --maxcount > 0) {
7030 i = stringlib_find(self->str+i, self->length-i,
7031 str1->str, str1->length,
7032 i);
7033 if (i == -1)
7034 break;
7035 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7036 i += str1->length;
7037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007040
Brett Cannonb94767f2011-02-22 20:15:44 +00007041 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007042 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 Py_UNICODE *p;
7044
7045 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007046 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7047 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007048 if (n == 0)
7049 goto nothing;
7050 /* new_size = self->length + n * (str2->length - str1->length)); */
7051 delta = (str2->length - str1->length);
7052 if (delta == 0) {
7053 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007055 product = n * (str2->length - str1->length);
7056 if ((product / (str2->length - str1->length)) != n) {
7057 PyErr_SetString(PyExc_OverflowError,
7058 "replace string is too long");
7059 return NULL;
7060 }
7061 new_size = self->length + product;
7062 if (new_size < 0) {
7063 PyErr_SetString(PyExc_OverflowError,
7064 "replace string is too long");
7065 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 }
7067 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007068 u = _PyUnicode_New(new_size);
7069 if (!u)
7070 return NULL;
7071 i = 0;
7072 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007073 if (str1->length > 0) {
7074 while (n-- > 0) {
7075 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007076 j = stringlib_find(self->str+i, self->length-i,
7077 str1->str, str1->length,
7078 i);
7079 if (j == -1)
7080 break;
7081 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007082 /* copy unchanged part [i:j] */
7083 Py_UNICODE_COPY(p, self->str+i, j-i);
7084 p += j - i;
7085 }
7086 /* copy substitution string */
7087 if (str2->length > 0) {
7088 Py_UNICODE_COPY(p, str2->str, str2->length);
7089 p += str2->length;
7090 }
7091 i = j + str1->length;
7092 }
7093 if (i < self->length)
7094 /* copy tail [i:] */
7095 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7096 } else {
7097 /* interleave */
7098 while (n > 0) {
7099 Py_UNICODE_COPY(p, str2->str, str2->length);
7100 p += str2->length;
7101 if (--n <= 0)
7102 break;
7103 *p++ = self->str[i++];
7104 }
7105 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007108 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007109
Benjamin Peterson29060642009-01-31 22:14:21 +00007110 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007111 /* nothing to replace; return original string (when possible) */
7112 if (PyUnicode_CheckExact(self)) {
7113 Py_INCREF(self);
7114 return (PyObject *) self;
7115 }
7116 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117}
7118
7119/* --- Unicode Object Methods --------------------------------------------- */
7120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007121PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123\n\
7124Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007125characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007126
7127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007128unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130 return fixup(self, fixtitle);
7131}
7132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007133PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007135\n\
7136Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007137have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138
7139static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007140unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 return fixup(self, fixcapitalize);
7143}
7144
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split on whitespace, capitalize each word, and re-join the
   words with the default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)old_word, fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* result is NULL here when capitalizing a word failed */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
7182
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007183/* Argument converter. Coerces to a single unicode character */
7184
7185static int
7186convert_uc(PyObject *obj, void *addr)
7187{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007188 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7189 PyObject *uniobj;
7190 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007191
Benjamin Peterson14339b62009-01-31 16:36:08 +00007192 uniobj = PyUnicode_FromObject(obj);
7193 if (uniobj == NULL) {
7194 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007196 return 0;
7197 }
7198 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7199 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007201 Py_DECREF(uniobj);
7202 return 0;
7203 }
7204 unistr = PyUnicode_AS_UNICODE(uniobj);
7205 *fillcharloc = unistr[0];
7206 Py_DECREF(uniobj);
7207 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007208}
7209
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007210PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007213Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007214done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215
7216static PyObject *
7217unicode_center(PyUnicodeObject *self, PyObject *args)
7218{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007219 Py_ssize_t marg, left;
7220 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007221 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222
Thomas Woutersde017742006-02-16 19:34:37 +00007223 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 return NULL;
7225
Tim Peters7a29bd52001-09-12 03:03:31 +00007226 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227 Py_INCREF(self);
7228 return (PyObject*) self;
7229 }
7230
7231 marg = width - self->length;
7232 left = marg / 2 + (marg & width & 1);
7233
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007234 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235}
7236
Marc-André Lemburge5034372000-08-08 08:04:29 +00007237#if 0
7238
7239/* This code should go into some future Unicode collation support
7240 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007241 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007242
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007243/* speedy UTF-16 code point order comparison */
7244/* gleaned from: */
7245/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7246
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007247static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007248{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007249 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007250 0, 0, 0, 0, 0, 0, 0, 0,
7251 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007252 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007253};
7254
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255static int
7256unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7257{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007258 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007259
Guido van Rossumd57fd912000-03-10 22:53:23 +00007260 Py_UNICODE *s1 = str1->str;
7261 Py_UNICODE *s2 = str2->str;
7262
7263 len1 = str1->length;
7264 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007265
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007267 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007268
7269 c1 = *s1++;
7270 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007271
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 if (c1 > (1<<11) * 26)
7273 c1 += utf16Fixup[c1>>11];
7274 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007275 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007276 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007277
7278 if (c1 != c2)
7279 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007280
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007281 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007282 }
7283
7284 return (len1 < len2) ? -1 : (len1 != len2);
7285}
7286
Marc-André Lemburge5034372000-08-08 08:04:29 +00007287#else
7288
7289static int
7290unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7291{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007292 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007293
7294 Py_UNICODE *s1 = str1->str;
7295 Py_UNICODE *s2 = str2->str;
7296
7297 len1 = str1->length;
7298 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007299
Marc-André Lemburge5034372000-08-08 08:04:29 +00007300 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007301 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007302
Fredrik Lundh45714e92001-06-26 16:39:36 +00007303 c1 = *s1++;
7304 c2 = *s2++;
7305
7306 if (c1 != c2)
7307 return (c1 < c2) ? -1 : 1;
7308
Marc-André Lemburge5034372000-08-08 08:04:29 +00007309 len1--; len2--;
7310 }
7311
7312 return (len1 < len2) ? -1 : (len1 != len2);
7313}
7314
7315#endif
7316
Alexander Belopolsky40018472011-02-26 01:02:56 +00007317int
7318PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007319{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007320 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7321 return unicode_compare((PyUnicodeObject *)left,
7322 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007323 PyErr_Format(PyExc_TypeError,
7324 "Can't compare %.100s and %.100s",
7325 left->ob_type->tp_name,
7326 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327 return -1;
7328}
7329
Martin v. Löwis5b222132007-06-10 09:51:05 +00007330int
7331PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7332{
7333 int i;
7334 Py_UNICODE *id;
7335 assert(PyUnicode_Check(uni));
7336 id = PyUnicode_AS_UNICODE(uni);
7337 /* Compare Unicode string and source character set string */
7338 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 if (id[i] != str[i])
7340 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007341 /* This check keeps Python strings that end in '\0' from comparing equal
7342 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007343 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007345 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007347 return 0;
7348}
7349
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007350
Benjamin Peterson29060642009-01-31 22:14:21 +00007351#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007352 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007353
Alexander Belopolsky40018472011-02-26 01:02:56 +00007354PyObject *
7355PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007356{
7357 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007358
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007359 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7360 PyObject *v;
Benjamin Peterson5fd4bd32011-03-06 09:06:34 -06007361 if (PyUnicode_GET_SIZE(left) != PyUnicode_GET_SIZE(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007362 if (op == Py_EQ) {
7363 Py_INCREF(Py_False);
7364 return Py_False;
7365 }
7366 if (op == Py_NE) {
7367 Py_INCREF(Py_True);
7368 return Py_True;
7369 }
7370 }
7371 if (left == right)
7372 result = 0;
7373 else
7374 result = unicode_compare((PyUnicodeObject *)left,
7375 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007376
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007377 /* Convert the return value to a Boolean */
7378 switch (op) {
7379 case Py_EQ:
7380 v = TEST_COND(result == 0);
7381 break;
7382 case Py_NE:
7383 v = TEST_COND(result != 0);
7384 break;
7385 case Py_LE:
7386 v = TEST_COND(result <= 0);
7387 break;
7388 case Py_GE:
7389 v = TEST_COND(result >= 0);
7390 break;
7391 case Py_LT:
7392 v = TEST_COND(result == -1);
7393 break;
7394 case Py_GT:
7395 v = TEST_COND(result == 1);
7396 break;
7397 default:
7398 PyErr_BadArgument();
7399 return NULL;
7400 }
7401 Py_INCREF(v);
7402 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007404
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007405 Py_INCREF(Py_NotImplemented);
7406 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007407}
7408
Alexander Belopolsky40018472011-02-26 01:02:56 +00007409int
7410PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007411{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007412 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007413 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007414
7415 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007416 sub = PyUnicode_FromObject(element);
7417 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 PyErr_Format(PyExc_TypeError,
7419 "'in <string>' requires string as left operand, not %s",
7420 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007421 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007422 }
7423
Thomas Wouters477c8d52006-05-27 19:21:47 +00007424 str = PyUnicode_FromObject(container);
7425 if (!str) {
7426 Py_DECREF(sub);
7427 return -1;
7428 }
7429
7430 result = stringlib_contains_obj(str, sub);
7431
7432 Py_DECREF(str);
7433 Py_DECREF(sub);
7434
Guido van Rossum403d68b2000-03-13 15:55:09 +00007435 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007436}
7437
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438/* Concat to string or Unicode object giving a new Unicode object. */
7439
Alexander Belopolsky40018472011-02-26 01:02:56 +00007440PyObject *
7441PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007442{
7443 PyUnicodeObject *u = NULL, *v = NULL, *w;
7444
7445 /* Coerce the two arguments */
7446 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7447 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7450 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452
7453 /* Shortcuts */
7454 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 Py_DECREF(v);
7456 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 }
7458 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 Py_DECREF(u);
7460 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 }
7462
7463 /* Concat the two Unicode strings */
7464 w = _PyUnicode_New(u->length + v->length);
7465 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467 Py_UNICODE_COPY(w->str, u->str, u->length);
7468 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7469
7470 Py_DECREF(u);
7471 Py_DECREF(v);
7472 return (PyObject *)w;
7473
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 Py_XDECREF(u);
7476 Py_XDECREF(v);
7477 return NULL;
7478}
7479
Walter Dörwald1ab83302007-05-18 17:15:44 +00007480void
7481PyUnicode_Append(PyObject **pleft, PyObject *right)
7482{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007483 PyObject *new;
7484 if (*pleft == NULL)
7485 return;
7486 if (right == NULL || !PyUnicode_Check(*pleft)) {
7487 Py_DECREF(*pleft);
7488 *pleft = NULL;
7489 return;
7490 }
7491 new = PyUnicode_Concat(*pleft, right);
7492 Py_DECREF(*pleft);
7493 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007494}
7495
7496void
7497PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7498{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007499 PyUnicode_Append(pleft, right);
7500 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007501}
7502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007503PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007506Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007507string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007508interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007509
7510static PyObject *
7511unicode_count(PyUnicodeObject *self, PyObject *args)
7512{
7513 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007514 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007515 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 PyObject *result;
7517
Guido van Rossumb8872e62000-05-09 14:14:27 +00007518 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 return NULL;
7521
7522 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007523 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007526
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007527 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007528 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007529 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007530 substring->str, substring->length,
7531 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007532 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007533
7534 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007535
Guido van Rossumd57fd912000-03-10 22:53:23 +00007536 return result;
7537}
7538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007539PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007540 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007541\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007542Encode S using the codec registered for encoding. Default encoding\n\
7543is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007544handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007545a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7546'xmlcharrefreplace' as well as any other name registered with\n\
7547codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007548
7549static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007550unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007552 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007553 char *encoding = NULL;
7554 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007555
Benjamin Peterson308d6372009-09-18 21:42:35 +00007556 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7557 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007559 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007560}
7561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007562PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007563 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007564\n\
7565Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007566If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567
7568static PyObject*
7569unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7570{
7571 Py_UNICODE *e;
7572 Py_UNICODE *p;
7573 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007574 Py_UNICODE *qe;
7575 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576 PyUnicodeObject *u;
7577 int tabsize = 8;
7578
7579 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007581
Thomas Wouters7e474022000-07-16 12:04:32 +00007582 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007583 i = 0; /* chars up to and including most recent \n or \r */
7584 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7585 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 for (p = self->str; p < e; p++)
7587 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 if (tabsize > 0) {
7589 incr = tabsize - (j % tabsize); /* cannot overflow */
7590 if (j > PY_SSIZE_T_MAX - incr)
7591 goto overflow1;
7592 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007595 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 if (j > PY_SSIZE_T_MAX - 1)
7597 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 j++;
7599 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 if (i > PY_SSIZE_T_MAX - j)
7601 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007603 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604 }
7605 }
7606
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007607 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007609
Guido van Rossumd57fd912000-03-10 22:53:23 +00007610 /* Second pass: create output string and fill it */
7611 u = _PyUnicode_New(i + j);
7612 if (!u)
7613 return NULL;
7614
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007615 j = 0; /* same as in first pass */
7616 q = u->str; /* next output char */
7617 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007618
7619 for (p = self->str; p < e; p++)
7620 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 if (tabsize > 0) {
7622 i = tabsize - (j % tabsize);
7623 j += i;
7624 while (i--) {
7625 if (q >= qe)
7626 goto overflow2;
7627 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007628 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007629 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007630 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 else {
7632 if (q >= qe)
7633 goto overflow2;
7634 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007635 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007636 if (*p == '\n' || *p == '\r')
7637 j = 0;
7638 }
7639
7640 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007641
7642 overflow2:
7643 Py_DECREF(u);
7644 overflow1:
7645 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647}
7648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007649PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007651\n\
7652Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007653such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007654arguments start and end are interpreted as in slice notation.\n\
7655\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007656Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657
7658static PyObject *
7659unicode_find(PyUnicodeObject *self, PyObject *args)
7660{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007661 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007662 Py_ssize_t start;
7663 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007664 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665
Christian Heimes9cd17752007-11-18 19:35:23 +00007666 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007668
Thomas Wouters477c8d52006-05-27 19:21:47 +00007669 result = stringlib_find_slice(
7670 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7671 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7672 start, end
7673 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674
7675 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007676
Christian Heimes217cfd12007-12-02 14:31:20 +00007677 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678}
7679
7680static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007681unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682{
7683 if (index < 0 || index >= self->length) {
7684 PyErr_SetString(PyExc_IndexError, "string index out of range");
7685 return NULL;
7686 }
7687
7688 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7689}
7690
Guido van Rossumc2504932007-09-18 19:42:40 +00007691/* Believe it or not, this produces the same value for ASCII strings
7692 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007693static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007694unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007695{
Guido van Rossumc2504932007-09-18 19:42:40 +00007696 Py_ssize_t len;
7697 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007698 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007699
7700 if (self->hash != -1)
7701 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007702 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007703 p = self->str;
7704 x = *p << 7;
7705 while (--len >= 0)
7706 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007707 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007708 if (x == -1)
7709 x = -2;
7710 self->hash = x;
7711 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712}
7713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007714PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007717Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007718
7719static PyObject *
7720unicode_index(PyUnicodeObject *self, PyObject *args)
7721{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007722 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007723 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007724 Py_ssize_t start;
7725 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007726
Christian Heimes9cd17752007-11-18 19:35:23 +00007727 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729
Thomas Wouters477c8d52006-05-27 19:21:47 +00007730 result = stringlib_find_slice(
7731 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7732 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7733 start, end
7734 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735
7736 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007737
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 if (result < 0) {
7739 PyErr_SetString(PyExc_ValueError, "substring not found");
7740 return NULL;
7741 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007742
Christian Heimes217cfd12007-12-02 14:31:20 +00007743 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744}
7745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007746PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007749Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007750at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007751
7752static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007753unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754{
7755 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7756 register const Py_UNICODE *e;
7757 int cased;
7758
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 /* Shortcut for single character strings */
7760 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007762
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007763 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007764 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007765 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007766
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767 e = p + PyUnicode_GET_SIZE(self);
7768 cased = 0;
7769 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007771
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7773 return PyBool_FromLong(0);
7774 else if (!cased && Py_UNICODE_ISLOWER(ch))
7775 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007777 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778}
7779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007780PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007782\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007783Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007784at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007785
7786static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007787unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788{
7789 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7790 register const Py_UNICODE *e;
7791 int cased;
7792
Guido van Rossumd57fd912000-03-10 22:53:23 +00007793 /* Shortcut for single character strings */
7794 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007796
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007797 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007798 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007800
Guido van Rossumd57fd912000-03-10 22:53:23 +00007801 e = p + PyUnicode_GET_SIZE(self);
7802 cased = 0;
7803 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007805
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7807 return PyBool_FromLong(0);
7808 else if (!cased && Py_UNICODE_ISUPPER(ch))
7809 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007810 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007811 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812}
7813
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007814PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007817Return True if S is a titlecased string and there is at least one\n\
7818character in S, i.e. upper- and titlecase characters may only\n\
7819follow uncased characters and lowercase characters only cased ones.\n\
7820Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821
7822static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007823unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824{
7825 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7826 register const Py_UNICODE *e;
7827 int cased, previous_is_cased;
7828
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829 /* Shortcut for single character strings */
7830 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7832 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007834 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007835 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007837
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 e = p + PyUnicode_GET_SIZE(self);
7839 cased = 0;
7840 previous_is_cased = 0;
7841 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007843
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7845 if (previous_is_cased)
7846 return PyBool_FromLong(0);
7847 previous_is_cased = 1;
7848 cased = 1;
7849 }
7850 else if (Py_UNICODE_ISLOWER(ch)) {
7851 if (!previous_is_cased)
7852 return PyBool_FromLong(0);
7853 previous_is_cased = 1;
7854 cased = 1;
7855 }
7856 else
7857 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007859 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860}
7861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007862PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007864\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007865Return True if all characters in S are whitespace\n\
7866and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007867
7868static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007869unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870{
7871 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7872 register const Py_UNICODE *e;
7873
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 /* Shortcut for single character strings */
7875 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 Py_UNICODE_ISSPACE(*p))
7877 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007879 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007880 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007882
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883 e = p + PyUnicode_GET_SIZE(self);
7884 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 if (!Py_UNICODE_ISSPACE(*p))
7886 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007888 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007889}
7890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007891PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007893\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007894Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007895and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007896
7897static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007898unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007899{
7900 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7901 register const Py_UNICODE *e;
7902
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007903 /* Shortcut for single character strings */
7904 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 Py_UNICODE_ISALPHA(*p))
7906 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007907
7908 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007909 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007911
7912 e = p + PyUnicode_GET_SIZE(self);
7913 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 if (!Py_UNICODE_ISALPHA(*p))
7915 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007916 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007917 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007918}
7919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007920PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007922\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007923Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007924and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007925
7926static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007927unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007928{
7929 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7930 register const Py_UNICODE *e;
7931
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007932 /* Shortcut for single character strings */
7933 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007934 Py_UNICODE_ISALNUM(*p))
7935 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007936
7937 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007938 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007940
7941 e = p + PyUnicode_GET_SIZE(self);
7942 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007943 if (!Py_UNICODE_ISALNUM(*p))
7944 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007945 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007946 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007947}
7948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007949PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007951\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007952Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007953False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007954
7955static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007956unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957{
7958 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7959 register const Py_UNICODE *e;
7960
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961 /* Shortcut for single character strings */
7962 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 Py_UNICODE_ISDECIMAL(*p))
7964 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007965
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007966 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007967 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007969
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 e = p + PyUnicode_GET_SIZE(self);
7971 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 if (!Py_UNICODE_ISDECIMAL(*p))
7973 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007975 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976}
7977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007978PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007981Return True if all characters in S are digits\n\
7982and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983
7984static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007985unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986{
7987 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7988 register const Py_UNICODE *e;
7989
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 /* Shortcut for single character strings */
7991 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 Py_UNICODE_ISDIGIT(*p))
7993 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007995 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007996 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007998
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 e = p + PyUnicode_GET_SIZE(self);
8000 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008001 if (!Py_UNICODE_ISDIGIT(*p))
8002 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008004 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005}
8006
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008007PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008009\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008010Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008011False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012
8013static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008014unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015{
8016 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8017 register const Py_UNICODE *e;
8018
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019 /* Shortcut for single character strings */
8020 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 Py_UNICODE_ISNUMERIC(*p))
8022 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008024 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008025 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008027
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 e = p + PyUnicode_GET_SIZE(self);
8029 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 if (!Py_UNICODE_ISNUMERIC(*p))
8031 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008032 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008033 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034}
8035
Martin v. Löwis47383402007-08-15 07:32:56 +00008036int
8037PyUnicode_IsIdentifier(PyObject *self)
8038{
8039 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8040 register const Py_UNICODE *e;
8041
8042 /* Special case for empty strings */
8043 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008045
8046 /* PEP 3131 says that the first character must be in
8047 XID_Start and subsequent characters in XID_Continue,
8048 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008049 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008050 letters, digits, underscore). However, given the current
8051 definition of XID_Start and XID_Continue, it is sufficient
8052 to check just for these, except that _ must be allowed
8053 as starting an identifier. */
8054 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8055 return 0;
8056
8057 e = p + PyUnicode_GET_SIZE(self);
8058 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 if (!_PyUnicode_IsXidContinue(*p))
8060 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008061 }
8062 return 1;
8063}
8064
8065PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008067\n\
8068Return True if S is a valid identifier according\n\
8069to the language definition.");
8070
8071static PyObject*
8072unicode_isidentifier(PyObject *self)
8073{
8074 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8075}
8076
Georg Brandl559e5d72008-06-11 18:37:52 +00008077PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008079\n\
8080Return True if all characters in S are considered\n\
8081printable in repr() or S is empty, False otherwise.");
8082
8083static PyObject*
8084unicode_isprintable(PyObject *self)
8085{
8086 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8087 register const Py_UNICODE *e;
8088
8089 /* Shortcut for single character strings */
8090 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8091 Py_RETURN_TRUE;
8092 }
8093
8094 e = p + PyUnicode_GET_SIZE(self);
8095 for (; p < e; p++) {
8096 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8097 Py_RETURN_FALSE;
8098 }
8099 }
8100 Py_RETURN_TRUE;
8101}
8102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008103PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008104 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105\n\
8106Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008107iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008108
8109static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008110unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008112 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113}
8114
Martin v. Löwis18e16552006-02-15 17:27:45 +00008115static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008116unicode_length(PyUnicodeObject *self)
8117{
8118 return self->length;
8119}
8120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008121PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008124Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008125done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
8127static PyObject *
8128unicode_ljust(PyUnicodeObject *self, PyObject *args)
8129{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008130 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008131 Py_UNICODE fillchar = ' ';
8132
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008133 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008134 return NULL;
8135
Tim Peters7a29bd52001-09-12 03:03:31 +00008136 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137 Py_INCREF(self);
8138 return (PyObject*) self;
8139 }
8140
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008141 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142}
8143
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008144PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008147Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148
8149static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008150unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008152 return fixup(self, fixlower);
8153}
8154
/* Strip types: which end(s) of the string get trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip type above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip",     /* BOTHSTRIP */
};

/* Method name for strip type i: the format string with its three
   leading characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
8163
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008164/* externally visible for str.strip(unicode) */
8165PyObject *
8166_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8167{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008168 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8169 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8170 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8171 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8172 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008173
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008175
Benjamin Peterson14339b62009-01-31 16:36:08 +00008176 i = 0;
8177 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8179 i++;
8180 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008182
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 j = len;
8184 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 do {
8186 j--;
8187 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8188 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008189 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008190
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 Py_INCREF(self);
8193 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008194 }
8195 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008197}
8198
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199
8200static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008201do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8204 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008205
Benjamin Peterson14339b62009-01-31 16:36:08 +00008206 i = 0;
8207 if (striptype != RIGHTSTRIP) {
8208 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8209 i++;
8210 }
8211 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008212
Benjamin Peterson14339b62009-01-31 16:36:08 +00008213 j = len;
8214 if (striptype != LEFTSTRIP) {
8215 do {
8216 j--;
8217 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8218 j++;
8219 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008220
Benjamin Peterson14339b62009-01-31 16:36:08 +00008221 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8222 Py_INCREF(self);
8223 return (PyObject*)self;
8224 }
8225 else
8226 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008227}
8228
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008229
8230static PyObject *
8231do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8232{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008233 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008234
Benjamin Peterson14339b62009-01-31 16:36:08 +00008235 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8236 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008237
Benjamin Peterson14339b62009-01-31 16:36:08 +00008238 if (sep != NULL && sep != Py_None) {
8239 if (PyUnicode_Check(sep))
8240 return _PyUnicode_XStrip(self, striptype, sep);
8241 else {
8242 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 "%s arg must be None or str",
8244 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008245 return NULL;
8246 }
8247 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008248
Benjamin Peterson14339b62009-01-31 16:36:08 +00008249 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008250}
8251
8252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008253PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008255\n\
8256Return a copy of the string S with leading and trailing\n\
8257whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008258If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008259
8260static PyObject *
8261unicode_strip(PyUnicodeObject *self, PyObject *args)
8262{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008263 if (PyTuple_GET_SIZE(args) == 0)
8264 return do_strip(self, BOTHSTRIP); /* Common case */
8265 else
8266 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008267}
8268
8269
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008270PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008272\n\
8273Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008274If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008275
8276static PyObject *
8277unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8278{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008279 if (PyTuple_GET_SIZE(args) == 0)
8280 return do_strip(self, LEFTSTRIP); /* Common case */
8281 else
8282 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008283}
8284
8285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008286PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008288\n\
8289Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008290If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008291
8292static PyObject *
8293unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8294{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008295 if (PyTuple_GET_SIZE(args) == 0)
8296 return do_strip(self, RIGHTSTRIP); /* Common case */
8297 else
8298 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008299}
8300
8301
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008303unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304{
8305 PyUnicodeObject *u;
8306 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008307 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008308 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309
Georg Brandl222de0f2009-04-12 12:01:50 +00008310 if (len < 1) {
8311 Py_INCREF(unicode_empty);
8312 return (PyObject *)unicode_empty;
8313 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314
Tim Peters7a29bd52001-09-12 03:03:31 +00008315 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 /* no repeat, return original string */
8317 Py_INCREF(str);
8318 return (PyObject*) str;
8319 }
Tim Peters8f422462000-09-09 06:13:41 +00008320
8321 /* ensure # of chars needed doesn't overflow int and # of bytes
8322 * needed doesn't overflow size_t
8323 */
8324 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008325 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008326 PyErr_SetString(PyExc_OverflowError,
8327 "repeated string is too long");
8328 return NULL;
8329 }
8330 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8331 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8332 PyErr_SetString(PyExc_OverflowError,
8333 "repeated string is too long");
8334 return NULL;
8335 }
8336 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 if (!u)
8338 return NULL;
8339
8340 p = u->str;
8341
Georg Brandl222de0f2009-04-12 12:01:50 +00008342 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008343 Py_UNICODE_FILL(p, str->str[0], len);
8344 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008345 Py_ssize_t done = str->length; /* number of characters copied this far */
8346 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008348 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008349 Py_UNICODE_COPY(p+done, p, n);
8350 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 }
8353
8354 return (PyObject*) u;
8355}
8356
Alexander Belopolsky40018472011-02-26 01:02:56 +00008357PyObject *
8358PyUnicode_Replace(PyObject *obj,
8359 PyObject *subobj,
8360 PyObject *replobj,
8361 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362{
8363 PyObject *self;
8364 PyObject *str1;
8365 PyObject *str2;
8366 PyObject *result;
8367
8368 self = PyUnicode_FromObject(obj);
8369 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371 str1 = PyUnicode_FromObject(subobj);
8372 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 Py_DECREF(self);
8374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375 }
8376 str2 = PyUnicode_FromObject(replobj);
8377 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 Py_DECREF(self);
8379 Py_DECREF(str1);
8380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008381 }
Tim Petersced69f82003-09-16 20:30:58 +00008382 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 (PyUnicodeObject *)str1,
8384 (PyUnicodeObject *)str2,
8385 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008386 Py_DECREF(self);
8387 Py_DECREF(str1);
8388 Py_DECREF(str2);
8389 return result;
8390}
8391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008392PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008393 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394\n\
8395Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008396old replaced by new. If the optional argument count is\n\
8397given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398
8399static PyObject*
8400unicode_replace(PyUnicodeObject *self, PyObject *args)
8401{
8402 PyUnicodeObject *str1;
8403 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008404 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008405 PyObject *result;
8406
Martin v. Löwis18e16552006-02-15 17:27:45 +00008407 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408 return NULL;
8409 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8410 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008413 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 Py_DECREF(str1);
8415 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008416 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417
8418 result = replace(self, str1, str2, maxcount);
8419
8420 Py_DECREF(str1);
8421 Py_DECREF(str2);
8422 return result;
8423}
8424
Alexander Belopolsky40018472011-02-26 01:02:56 +00008425static PyObject *
8426unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008428 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008429 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008430 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8431 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8432
8433 /* XXX(nnorwitz): rather than over-allocating, it would be
8434 better to choose a different scheme. Perhaps scan the
8435 first N-chars of the string and allocate based on that size.
8436 */
8437 /* Initial allocation is based on the longest-possible unichr
8438 escape.
8439
8440 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8441 unichr, so in this case it's the longest unichr escape. In
8442 narrow (UTF-16) builds this is five chars per source unichr
8443 since there are two unichrs in the surrogate pair, so in narrow
8444 (UTF-16) builds it's not the longest unichr escape.
8445
8446 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8447 so in the narrow (UTF-16) build case it's the longest unichr
8448 escape.
8449 */
8450
Walter Dörwald1ab83302007-05-18 17:15:44 +00008451 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008453#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008455#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008457#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008459 if (repr == NULL)
8460 return NULL;
8461
Walter Dörwald1ab83302007-05-18 17:15:44 +00008462 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008463
8464 /* Add quote */
8465 *p++ = (findchar(s, size, '\'') &&
8466 !findchar(s, size, '"')) ? '"' : '\'';
8467 while (size-- > 0) {
8468 Py_UNICODE ch = *s++;
8469
8470 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008471 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008472 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008473 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008474 continue;
8475 }
8476
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008478 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008479 *p++ = '\\';
8480 *p++ = 't';
8481 }
8482 else if (ch == '\n') {
8483 *p++ = '\\';
8484 *p++ = 'n';
8485 }
8486 else if (ch == '\r') {
8487 *p++ = '\\';
8488 *p++ = 'r';
8489 }
8490
8491 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008492 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008493 *p++ = '\\';
8494 *p++ = 'x';
8495 *p++ = hexdigits[(ch >> 4) & 0x000F];
8496 *p++ = hexdigits[ch & 0x000F];
8497 }
8498
Georg Brandl559e5d72008-06-11 18:37:52 +00008499 /* Copy ASCII characters as-is */
8500 else if (ch < 0x7F) {
8501 *p++ = ch;
8502 }
8503
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008505 else {
8506 Py_UCS4 ucs = ch;
8507
8508#ifndef Py_UNICODE_WIDE
8509 Py_UNICODE ch2 = 0;
8510 /* Get code point from surrogate pair */
8511 if (size > 0) {
8512 ch2 = *s;
8513 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008515 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008518 size--;
8519 }
8520 }
8521#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008522 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008523 (categories Z* and C* except ASCII space)
8524 */
8525 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8526 /* Map 8-bit characters to '\xhh' */
8527 if (ucs <= 0xff) {
8528 *p++ = '\\';
8529 *p++ = 'x';
8530 *p++ = hexdigits[(ch >> 4) & 0x000F];
8531 *p++ = hexdigits[ch & 0x000F];
8532 }
8533 /* Map 21-bit characters to '\U00xxxxxx' */
8534 else if (ucs >= 0x10000) {
8535 *p++ = '\\';
8536 *p++ = 'U';
8537 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8538 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8539 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8540 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8541 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8542 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8543 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8544 *p++ = hexdigits[ucs & 0x0000000F];
8545 }
8546 /* Map 16-bit characters to '\uxxxx' */
8547 else {
8548 *p++ = '\\';
8549 *p++ = 'u';
8550 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8551 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8552 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8553 *p++ = hexdigits[ucs & 0x000F];
8554 }
8555 }
8556 /* Copy characters as-is */
8557 else {
8558 *p++ = ch;
8559#ifndef Py_UNICODE_WIDE
8560 if (ucs >= 0x10000)
8561 *p++ = ch2;
8562#endif
8563 }
8564 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008565 }
8566 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008567 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008568
8569 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008570 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008571 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572}
8573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008574PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008576\n\
8577Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008578such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579arguments start and end are interpreted as in slice notation.\n\
8580\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008581Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582
8583static PyObject *
8584unicode_rfind(PyUnicodeObject *self, PyObject *args)
8585{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008586 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008587 Py_ssize_t start;
8588 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008589 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590
Christian Heimes9cd17752007-11-18 19:35:23 +00008591 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008592 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593
Thomas Wouters477c8d52006-05-27 19:21:47 +00008594 result = stringlib_rfind_slice(
8595 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8596 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8597 start, end
8598 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599
8600 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008601
Christian Heimes217cfd12007-12-02 14:31:20 +00008602 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603}
8604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008605PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008608Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
8610static PyObject *
8611unicode_rindex(PyUnicodeObject *self, PyObject *args)
8612{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008613 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008614 Py_ssize_t start;
8615 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008616 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
Christian Heimes9cd17752007-11-18 19:35:23 +00008618 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008619 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620
Thomas Wouters477c8d52006-05-27 19:21:47 +00008621 result = stringlib_rfind_slice(
8622 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8623 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8624 start, end
8625 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626
8627 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008628
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 if (result < 0) {
8630 PyErr_SetString(PyExc_ValueError, "substring not found");
8631 return NULL;
8632 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008633 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634}
8635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008636PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008637 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008639Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008640done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008641
8642static PyObject *
8643unicode_rjust(PyUnicodeObject *self, PyObject *args)
8644{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008645 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008646 Py_UNICODE fillchar = ' ';
8647
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008648 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 return NULL;
8650
Tim Peters7a29bd52001-09-12 03:03:31 +00008651 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 Py_INCREF(self);
8653 return (PyObject*) self;
8654 }
8655
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008656 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657}
8658
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659PyObject *
8660PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661{
8662 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008663
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664 s = PyUnicode_FromObject(s);
8665 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008666 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 if (sep != NULL) {
8668 sep = PyUnicode_FromObject(sep);
8669 if (sep == NULL) {
8670 Py_DECREF(s);
8671 return NULL;
8672 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 }
8674
8675 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8676
8677 Py_DECREF(s);
8678 Py_XDECREF(sep);
8679 return result;
8680}
8681
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008682PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684\n\
8685Return a list of the words in S, using sep as the\n\
8686delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008687splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008688whitespace string is a separator and empty strings are\n\
8689removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690
8691static PyObject*
8692unicode_split(PyUnicodeObject *self, PyObject *args)
8693{
8694 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008695 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008696
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 return NULL;
8699
8700 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008706}
8707
Thomas Wouters477c8d52006-05-27 19:21:47 +00008708PyObject *
8709PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8710{
8711 PyObject* str_obj;
8712 PyObject* sep_obj;
8713 PyObject* out;
8714
8715 str_obj = PyUnicode_FromObject(str_in);
8716 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008718 sep_obj = PyUnicode_FromObject(sep_in);
8719 if (!sep_obj) {
8720 Py_DECREF(str_obj);
8721 return NULL;
8722 }
8723
8724 out = stringlib_partition(
8725 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8726 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8727 );
8728
8729 Py_DECREF(sep_obj);
8730 Py_DECREF(str_obj);
8731
8732 return out;
8733}
8734
8735
8736PyObject *
8737PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8738{
8739 PyObject* str_obj;
8740 PyObject* sep_obj;
8741 PyObject* out;
8742
8743 str_obj = PyUnicode_FromObject(str_in);
8744 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008746 sep_obj = PyUnicode_FromObject(sep_in);
8747 if (!sep_obj) {
8748 Py_DECREF(str_obj);
8749 return NULL;
8750 }
8751
8752 out = stringlib_rpartition(
8753 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8754 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8755 );
8756
8757 Py_DECREF(sep_obj);
8758 Py_DECREF(str_obj);
8759
8760 return out;
8761}
8762
8763PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008765\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008766Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008767the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008768found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008769
8770static PyObject*
8771unicode_partition(PyUnicodeObject *self, PyObject *separator)
8772{
8773 return PyUnicode_Partition((PyObject *)self, separator);
8774}
8775
8776PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008777 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008778\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008779Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008780the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008781separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008782
8783static PyObject*
8784unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8785{
8786 return PyUnicode_RPartition((PyObject *)self, separator);
8787}
8788
Alexander Belopolsky40018472011-02-26 01:02:56 +00008789PyObject *
8790PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008791{
8792 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008793
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008794 s = PyUnicode_FromObject(s);
8795 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008796 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 if (sep != NULL) {
8798 sep = PyUnicode_FromObject(sep);
8799 if (sep == NULL) {
8800 Py_DECREF(s);
8801 return NULL;
8802 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008803 }
8804
8805 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8806
8807 Py_DECREF(s);
8808 Py_XDECREF(sep);
8809 return result;
8810}
8811
8812PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008814\n\
8815Return a list of the words in S, using sep as the\n\
8816delimiter string, starting at the end of the string and\n\
8817working to the front. If maxsplit is given, at most maxsplit\n\
8818splits are done. If sep is not specified, any whitespace string\n\
8819is a separator.");
8820
8821static PyObject*
8822unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8823{
8824 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008825 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008826
Martin v. Löwis18e16552006-02-15 17:27:45 +00008827 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008828 return NULL;
8829
8830 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008831 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008832 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008834 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008836}
8837
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008838PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008839 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840\n\
8841Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008842Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008843is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008844
8845static PyObject*
8846unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8847{
Guido van Rossum86662912000-04-11 15:38:46 +00008848 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008849
Guido van Rossum86662912000-04-11 15:38:46 +00008850 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 return NULL;
8852
Guido van Rossum86662912000-04-11 15:38:46 +00008853 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854}
8855
8856static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008857PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858{
Walter Dörwald346737f2007-05-31 10:44:43 +00008859 if (PyUnicode_CheckExact(self)) {
8860 Py_INCREF(self);
8861 return self;
8862 } else
8863 /* Subtype -- return genuine unicode string with the same value. */
8864 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8865 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866}
8867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008868PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870\n\
8871Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008872and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873
8874static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008875unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 return fixup(self, fixswapcase);
8878}
8879
Georg Brandlceee0772007-11-27 23:48:05 +00008880PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008882\n\
8883Return a translation table usable for str.translate().\n\
8884If there is only one argument, it must be a dictionary mapping Unicode\n\
8885ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008886Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008887If there are two arguments, they must be strings of equal length, and\n\
8888in the resulting dictionary, each character in x will be mapped to the\n\
8889character at the same position in y. If there is a third argument, it\n\
8890must be a string, whose characters will be mapped to None in the result.");
8891
8892static PyObject*
8893unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8894{
8895 PyObject *x, *y = NULL, *z = NULL;
8896 PyObject *new = NULL, *key, *value;
8897 Py_ssize_t i = 0;
8898 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008899
Georg Brandlceee0772007-11-27 23:48:05 +00008900 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8901 return NULL;
8902 new = PyDict_New();
8903 if (!new)
8904 return NULL;
8905 if (y != NULL) {
8906 /* x must be a string too, of equal length */
8907 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8908 if (!PyUnicode_Check(x)) {
8909 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8910 "be a string if there is a second argument");
8911 goto err;
8912 }
8913 if (PyUnicode_GET_SIZE(x) != ylen) {
8914 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8915 "arguments must have equal length");
8916 goto err;
8917 }
8918 /* create entries for translating chars in x to those in y */
8919 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008920 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8921 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008922 if (!key || !value)
8923 goto err;
8924 res = PyDict_SetItem(new, key, value);
8925 Py_DECREF(key);
8926 Py_DECREF(value);
8927 if (res < 0)
8928 goto err;
8929 }
8930 /* create entries for deleting chars in z */
8931 if (z != NULL) {
8932 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008933 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008934 if (!key)
8935 goto err;
8936 res = PyDict_SetItem(new, key, Py_None);
8937 Py_DECREF(key);
8938 if (res < 0)
8939 goto err;
8940 }
8941 }
8942 } else {
8943 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008944 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008945 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8946 "to maketrans it must be a dict");
8947 goto err;
8948 }
8949 /* copy entries into the new dict, converting string keys to int keys */
8950 while (PyDict_Next(x, &i, &key, &value)) {
8951 if (PyUnicode_Check(key)) {
8952 /* convert string keys to integer keys */
8953 PyObject *newkey;
8954 if (PyUnicode_GET_SIZE(key) != 1) {
8955 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8956 "table must be of length 1");
8957 goto err;
8958 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008959 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008960 if (!newkey)
8961 goto err;
8962 res = PyDict_SetItem(new, newkey, value);
8963 Py_DECREF(newkey);
8964 if (res < 0)
8965 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008966 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008967 /* just keep integer keys */
8968 if (PyDict_SetItem(new, key, value) < 0)
8969 goto err;
8970 } else {
8971 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8972 "be strings or integers");
8973 goto err;
8974 }
8975 }
8976 }
8977 return new;
8978 err:
8979 Py_DECREF(new);
8980 return NULL;
8981}
8982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008983PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985\n\
8986Return a copy of the string S, where all characters have been mapped\n\
8987through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008988Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008989Unmapped characters are left untouched. Characters mapped to None\n\
8990are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991
8992static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008993unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994{
Georg Brandlceee0772007-11-27 23:48:05 +00008995 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996}
8997
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008998PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008999 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009001Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002
9003static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009004unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006 return fixup(self, fixupper);
9007}
9008
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009009PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009012Pad a numeric string S with zeros on the left, to fill a field\n\
9013of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014
9015static PyObject *
9016unicode_zfill(PyUnicodeObject *self, PyObject *args)
9017{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009018 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 PyUnicodeObject *u;
9020
Martin v. Löwis18e16552006-02-15 17:27:45 +00009021 Py_ssize_t width;
9022 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 return NULL;
9024
9025 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009026 if (PyUnicode_CheckExact(self)) {
9027 Py_INCREF(self);
9028 return (PyObject*) self;
9029 }
9030 else
9031 return PyUnicode_FromUnicode(
9032 PyUnicode_AS_UNICODE(self),
9033 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035 }
9036
9037 fill = width - self->length;
9038
9039 u = pad(self, fill, 0, '0');
9040
Walter Dörwald068325e2002-04-15 13:36:47 +00009041 if (u == NULL)
9042 return NULL;
9043
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 if (u->str[fill] == '+' || u->str[fill] == '-') {
9045 /* move sign to beginning of string */
9046 u->str[0] = u->str[fill];
9047 u->str[fill] = '0';
9048 }
9049
9050 return (PyObject*) u;
9051}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052
#if 0
/* Compiled out.  Returns the value of numfree as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}

/* Compiled out.  Passes self's buffer and size to
   PyUnicode_TransformDecimalToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
9067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009068PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009069 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009071Return True if S starts with the specified prefix, False otherwise.\n\
9072With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009073With optional end, stop comparing S at that position.\n\
9074prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075
9076static PyObject *
9077unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009078 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009080 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009082 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009083 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009084 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009086 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009087 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9088 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009089 if (PyTuple_Check(subobj)) {
9090 Py_ssize_t i;
9091 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9092 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009094 if (substring == NULL)
9095 return NULL;
9096 result = tailmatch(self, substring, start, end, -1);
9097 Py_DECREF(substring);
9098 if (result) {
9099 Py_RETURN_TRUE;
9100 }
9101 }
9102 /* nothing matched */
9103 Py_RETURN_FALSE;
9104 }
9105 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009108 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009109 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009110 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111}
9112
9113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009114PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009117Return True if S ends with the specified suffix, False otherwise.\n\
9118With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009119With optional end, stop comparing S at that position.\n\
9120suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121
9122static PyObject *
9123unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009126 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009128 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009129 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009130 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009132 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009133 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9134 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009135 if (PyTuple_Check(subobj)) {
9136 Py_ssize_t i;
9137 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9138 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009140 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009142 result = tailmatch(self, substring, start, end, +1);
9143 Py_DECREF(substring);
9144 if (result) {
9145 Py_RETURN_TRUE;
9146 }
9147 }
9148 Py_RETURN_FALSE;
9149 }
9150 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009154 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009156 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157}
9158
Eric Smith8c663262007-08-25 02:26:07 +00009159#include "stringlib/string_format.h"
9160
9161PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009162 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009163\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009164Return a formatted version of S, using substitutions from args and kwargs.\n\
9165The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009166
Eric Smith27bbca62010-11-04 17:06:58 +00009167PyDoc_STRVAR(format_map__doc__,
9168 "S.format_map(mapping) -> str\n\
9169\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009170Return a formatted version of S, using substitutions from mapping.\n\
9171The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009172
Eric Smith4a7d76d2008-05-30 18:10:19 +00009173static PyObject *
9174unicode__format__(PyObject* self, PyObject* args)
9175{
9176 PyObject *format_spec;
9177
9178 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9179 return NULL;
9180
9181 return _PyUnicode_FormatAdvanced(self,
9182 PyUnicode_AS_UNICODE(format_spec),
9183 PyUnicode_GET_SIZE(format_spec));
9184}
9185
Eric Smith8c663262007-08-25 02:26:07 +00009186PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009187 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009188\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009189Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009190
9191static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009192unicode__sizeof__(PyUnicodeObject *v)
9193{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009194 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9195 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009196}
9197
9198PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009200
9201static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009202unicode_getnewargs(PyUnicodeObject *v)
9203{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009204 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009205}
9206
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207static PyMethodDef unicode_methods[] = {
9208
9209 /* Order is according to common usage: often used methods should
9210 appear first, since lookup is done sequentially. */
9211
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009212 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009213 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9214 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009215 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009216 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9217 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9218 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9219 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9220 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9221 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9222 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009223 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009224 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9225 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9226 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009227 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009228 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9229 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9230 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009231 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009232 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009233 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009234 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009235 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9236 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9237 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9238 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9239 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9240 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9241 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9242 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9243 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9244 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9245 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9246 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9247 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9248 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009249 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009250 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009251 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009252 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009253 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009254 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009255 {"maketrans", (PyCFunction) unicode_maketrans,
9256 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009257 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009258#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009259 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009260#endif
9261
9262#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009263 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009264 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009265 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009266#endif
9267
Benjamin Peterson14339b62009-01-31 16:36:08 +00009268 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009269 {NULL, NULL}
9270};
9271
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009272static PyObject *
9273unicode_mod(PyObject *v, PyObject *w)
9274{
Benjamin Peterson29060642009-01-31 22:14:21 +00009275 if (!PyUnicode_Check(v)) {
9276 Py_INCREF(Py_NotImplemented);
9277 return Py_NotImplemented;
9278 }
9279 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009280}
9281
9282static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009283 0, /*nb_add*/
9284 0, /*nb_subtract*/
9285 0, /*nb_multiply*/
9286 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009287};
9288
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009290 (lenfunc) unicode_length, /* sq_length */
9291 PyUnicode_Concat, /* sq_concat */
9292 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9293 (ssizeargfunc) unicode_getitem, /* sq_item */
9294 0, /* sq_slice */
9295 0, /* sq_ass_item */
9296 0, /* sq_ass_slice */
9297 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298};
9299
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009300static PyObject*
9301unicode_subscript(PyUnicodeObject* self, PyObject* item)
9302{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009303 if (PyIndex_Check(item)) {
9304 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009305 if (i == -1 && PyErr_Occurred())
9306 return NULL;
9307 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009308 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009309 return unicode_getitem(self, i);
9310 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009311 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009312 Py_UNICODE* source_buf;
9313 Py_UNICODE* result_buf;
9314 PyObject* result;
9315
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009316 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009317 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009318 return NULL;
9319 }
9320
9321 if (slicelength <= 0) {
9322 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009323 } else if (start == 0 && step == 1 && slicelength == self->length &&
9324 PyUnicode_CheckExact(self)) {
9325 Py_INCREF(self);
9326 return (PyObject *)self;
9327 } else if (step == 1) {
9328 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009329 } else {
9330 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009331 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9332 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009333
Benjamin Peterson29060642009-01-31 22:14:21 +00009334 if (result_buf == NULL)
9335 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009336
9337 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9338 result_buf[i] = source_buf[cur];
9339 }
Tim Petersced69f82003-09-16 20:30:58 +00009340
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009341 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009342 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009343 return result;
9344 }
9345 } else {
9346 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9347 return NULL;
9348 }
9349}
9350
9351static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009352 (lenfunc)unicode_length, /* mp_length */
9353 (binaryfunc)unicode_subscript, /* mp_subscript */
9354 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009355};
9356
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358/* Helpers for PyUnicode_Format() */
9359
9360static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009361getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009363 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009365 (*p_argidx)++;
9366 if (arglen < 0)
9367 return args;
9368 else
9369 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 }
9371 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 return NULL;
9374}
9375
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009376/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009378static PyObject *
9379formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009381 char *p;
9382 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009384
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385 x = PyFloat_AsDouble(v);
9386 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009387 return NULL;
9388
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009390 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009391
Eric Smith0923d1d2009-04-16 20:16:10 +00009392 p = PyOS_double_to_string(x, type, prec,
9393 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009394 if (p == NULL)
9395 return NULL;
9396 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009397 PyMem_Free(p);
9398 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399}
9400
Tim Peters38fd5b62000-09-21 05:43:11 +00009401static PyObject*
9402formatlong(PyObject *val, int flags, int prec, int type)
9403{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009404 char *buf;
9405 int len;
9406 PyObject *str; /* temporary string object. */
9407 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009408
Benjamin Peterson14339b62009-01-31 16:36:08 +00009409 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9410 if (!str)
9411 return NULL;
9412 result = PyUnicode_FromStringAndSize(buf, len);
9413 Py_DECREF(str);
9414 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009415}
9416
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417static int
9418formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009419 size_t buflen,
9420 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009421{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009422 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009423 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009424 if (PyUnicode_GET_SIZE(v) == 1) {
9425 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9426 buf[1] = '\0';
9427 return 1;
9428 }
9429#ifndef Py_UNICODE_WIDE
9430 if (PyUnicode_GET_SIZE(v) == 2) {
9431 /* Decode a valid surrogate pair */
9432 int c0 = PyUnicode_AS_UNICODE(v)[0];
9433 int c1 = PyUnicode_AS_UNICODE(v)[1];
9434 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9435 0xDC00 <= c1 && c1 <= 0xDFFF) {
9436 buf[0] = c0;
9437 buf[1] = c1;
9438 buf[2] = '\0';
9439 return 2;
9440 }
9441 }
9442#endif
9443 goto onError;
9444 }
9445 else {
9446 /* Integer input truncated to a character */
9447 long x;
9448 x = PyLong_AsLong(v);
9449 if (x == -1 && PyErr_Occurred())
9450 goto onError;
9451
9452 if (x < 0 || x > 0x10ffff) {
9453 PyErr_SetString(PyExc_OverflowError,
9454 "%c arg not in range(0x110000)");
9455 return -1;
9456 }
9457
9458#ifndef Py_UNICODE_WIDE
9459 if (x > 0xffff) {
9460 x -= 0x10000;
9461 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9462 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9463 return 2;
9464 }
9465#endif
9466 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009467 buf[1] = '\0';
9468 return 1;
9469 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009470
Benjamin Peterson29060642009-01-31 22:14:21 +00009471 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009472 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009474 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475}
9476
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009477/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009478 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009479*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009480#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009481
Alexander Belopolsky40018472011-02-26 01:02:56 +00009482PyObject *
9483PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484{
9485 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009486 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487 int args_owned = 0;
9488 PyUnicodeObject *result = NULL;
9489 PyObject *dict = NULL;
9490 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009491
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 PyErr_BadInternalCall();
9494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 }
9496 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009497 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009498 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499 fmt = PyUnicode_AS_UNICODE(uformat);
9500 fmtcnt = PyUnicode_GET_SIZE(uformat);
9501
9502 reslen = rescnt = fmtcnt + 100;
9503 result = _PyUnicode_New(reslen);
9504 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 res = PyUnicode_AS_UNICODE(result);
9507
9508 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 arglen = PyTuple_Size(args);
9510 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 }
9512 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009513 arglen = -1;
9514 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009516 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009517 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009518 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519
9520 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009521 if (*fmt != '%') {
9522 if (--rescnt < 0) {
9523 rescnt = fmtcnt + 100;
9524 reslen += rescnt;
9525 if (_PyUnicode_Resize(&result, reslen) < 0)
9526 goto onError;
9527 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9528 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009529 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009530 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009531 }
9532 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 /* Got a format specifier */
9534 int flags = 0;
9535 Py_ssize_t width = -1;
9536 int prec = -1;
9537 Py_UNICODE c = '\0';
9538 Py_UNICODE fill;
9539 int isnumok;
9540 PyObject *v = NULL;
9541 PyObject *temp = NULL;
9542 Py_UNICODE *pbuf;
9543 Py_UNICODE sign;
9544 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009545 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009546
Benjamin Peterson29060642009-01-31 22:14:21 +00009547 fmt++;
9548 if (*fmt == '(') {
9549 Py_UNICODE *keystart;
9550 Py_ssize_t keylen;
9551 PyObject *key;
9552 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009553
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 if (dict == NULL) {
9555 PyErr_SetString(PyExc_TypeError,
9556 "format requires a mapping");
9557 goto onError;
9558 }
9559 ++fmt;
9560 --fmtcnt;
9561 keystart = fmt;
9562 /* Skip over balanced parentheses */
9563 while (pcount > 0 && --fmtcnt >= 0) {
9564 if (*fmt == ')')
9565 --pcount;
9566 else if (*fmt == '(')
9567 ++pcount;
9568 fmt++;
9569 }
9570 keylen = fmt - keystart - 1;
9571 if (fmtcnt < 0 || pcount > 0) {
9572 PyErr_SetString(PyExc_ValueError,
9573 "incomplete format key");
9574 goto onError;
9575 }
9576#if 0
9577 /* keys are converted to strings using UTF-8 and
9578 then looked up since Python uses strings to hold
9579 variables names etc. in its namespaces and we
9580 wouldn't want to break common idioms. */
9581 key = PyUnicode_EncodeUTF8(keystart,
9582 keylen,
9583 NULL);
9584#else
9585 key = PyUnicode_FromUnicode(keystart, keylen);
9586#endif
9587 if (key == NULL)
9588 goto onError;
9589 if (args_owned) {
9590 Py_DECREF(args);
9591 args_owned = 0;
9592 }
9593 args = PyObject_GetItem(dict, key);
9594 Py_DECREF(key);
9595 if (args == NULL) {
9596 goto onError;
9597 }
9598 args_owned = 1;
9599 arglen = -1;
9600 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009602 while (--fmtcnt >= 0) {
9603 switch (c = *fmt++) {
9604 case '-': flags |= F_LJUST; continue;
9605 case '+': flags |= F_SIGN; continue;
9606 case ' ': flags |= F_BLANK; continue;
9607 case '#': flags |= F_ALT; continue;
9608 case '0': flags |= F_ZERO; continue;
9609 }
9610 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009611 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 if (c == '*') {
9613 v = getnextarg(args, arglen, &argidx);
9614 if (v == NULL)
9615 goto onError;
9616 if (!PyLong_Check(v)) {
9617 PyErr_SetString(PyExc_TypeError,
9618 "* wants int");
9619 goto onError;
9620 }
9621 width = PyLong_AsLong(v);
9622 if (width == -1 && PyErr_Occurred())
9623 goto onError;
9624 if (width < 0) {
9625 flags |= F_LJUST;
9626 width = -width;
9627 }
9628 if (--fmtcnt >= 0)
9629 c = *fmt++;
9630 }
9631 else if (c >= '0' && c <= '9') {
9632 width = c - '0';
9633 while (--fmtcnt >= 0) {
9634 c = *fmt++;
9635 if (c < '0' || c > '9')
9636 break;
9637 if ((width*10) / 10 != width) {
9638 PyErr_SetString(PyExc_ValueError,
9639 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009640 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009641 }
9642 width = width*10 + (c - '0');
9643 }
9644 }
9645 if (c == '.') {
9646 prec = 0;
9647 if (--fmtcnt >= 0)
9648 c = *fmt++;
9649 if (c == '*') {
9650 v = getnextarg(args, arglen, &argidx);
9651 if (v == NULL)
9652 goto onError;
9653 if (!PyLong_Check(v)) {
9654 PyErr_SetString(PyExc_TypeError,
9655 "* wants int");
9656 goto onError;
9657 }
9658 prec = PyLong_AsLong(v);
9659 if (prec == -1 && PyErr_Occurred())
9660 goto onError;
9661 if (prec < 0)
9662 prec = 0;
9663 if (--fmtcnt >= 0)
9664 c = *fmt++;
9665 }
9666 else if (c >= '0' && c <= '9') {
9667 prec = c - '0';
9668 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009669 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009670 if (c < '0' || c > '9')
9671 break;
9672 if ((prec*10) / 10 != prec) {
9673 PyErr_SetString(PyExc_ValueError,
9674 "prec too big");
9675 goto onError;
9676 }
9677 prec = prec*10 + (c - '0');
9678 }
9679 }
9680 } /* prec */
9681 if (fmtcnt >= 0) {
9682 if (c == 'h' || c == 'l' || c == 'L') {
9683 if (--fmtcnt >= 0)
9684 c = *fmt++;
9685 }
9686 }
9687 if (fmtcnt < 0) {
9688 PyErr_SetString(PyExc_ValueError,
9689 "incomplete format");
9690 goto onError;
9691 }
9692 if (c != '%') {
9693 v = getnextarg(args, arglen, &argidx);
9694 if (v == NULL)
9695 goto onError;
9696 }
9697 sign = 0;
9698 fill = ' ';
9699 switch (c) {
9700
9701 case '%':
9702 pbuf = formatbuf;
9703 /* presume that buffer length is at least 1 */
9704 pbuf[0] = '%';
9705 len = 1;
9706 break;
9707
9708 case 's':
9709 case 'r':
9710 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009711 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009712 temp = v;
9713 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009714 }
9715 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009716 if (c == 's')
9717 temp = PyObject_Str(v);
9718 else if (c == 'r')
9719 temp = PyObject_Repr(v);
9720 else
9721 temp = PyObject_ASCII(v);
9722 if (temp == NULL)
9723 goto onError;
9724 if (PyUnicode_Check(temp))
9725 /* nothing to do */;
9726 else {
9727 Py_DECREF(temp);
9728 PyErr_SetString(PyExc_TypeError,
9729 "%s argument has non-string str()");
9730 goto onError;
9731 }
9732 }
9733 pbuf = PyUnicode_AS_UNICODE(temp);
9734 len = PyUnicode_GET_SIZE(temp);
9735 if (prec >= 0 && len > prec)
9736 len = prec;
9737 break;
9738
9739 case 'i':
9740 case 'd':
9741 case 'u':
9742 case 'o':
9743 case 'x':
9744 case 'X':
9745 if (c == 'i')
9746 c = 'd';
9747 isnumok = 0;
9748 if (PyNumber_Check(v)) {
9749 PyObject *iobj=NULL;
9750
9751 if (PyLong_Check(v)) {
9752 iobj = v;
9753 Py_INCREF(iobj);
9754 }
9755 else {
9756 iobj = PyNumber_Long(v);
9757 }
9758 if (iobj!=NULL) {
9759 if (PyLong_Check(iobj)) {
9760 isnumok = 1;
9761 temp = formatlong(iobj, flags, prec, c);
9762 Py_DECREF(iobj);
9763 if (!temp)
9764 goto onError;
9765 pbuf = PyUnicode_AS_UNICODE(temp);
9766 len = PyUnicode_GET_SIZE(temp);
9767 sign = 1;
9768 }
9769 else {
9770 Py_DECREF(iobj);
9771 }
9772 }
9773 }
9774 if (!isnumok) {
9775 PyErr_Format(PyExc_TypeError,
9776 "%%%c format: a number is required, "
9777 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9778 goto onError;
9779 }
9780 if (flags & F_ZERO)
9781 fill = '0';
9782 break;
9783
9784 case 'e':
9785 case 'E':
9786 case 'f':
9787 case 'F':
9788 case 'g':
9789 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009790 temp = formatfloat(v, flags, prec, c);
9791 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009792 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009793 pbuf = PyUnicode_AS_UNICODE(temp);
9794 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009795 sign = 1;
9796 if (flags & F_ZERO)
9797 fill = '0';
9798 break;
9799
9800 case 'c':
9801 pbuf = formatbuf;
9802 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9803 if (len < 0)
9804 goto onError;
9805 break;
9806
9807 default:
9808 PyErr_Format(PyExc_ValueError,
9809 "unsupported format character '%c' (0x%x) "
9810 "at index %zd",
9811 (31<=c && c<=126) ? (char)c : '?',
9812 (int)c,
9813 (Py_ssize_t)(fmt - 1 -
9814 PyUnicode_AS_UNICODE(uformat)));
9815 goto onError;
9816 }
9817 if (sign) {
9818 if (*pbuf == '-' || *pbuf == '+') {
9819 sign = *pbuf++;
9820 len--;
9821 }
9822 else if (flags & F_SIGN)
9823 sign = '+';
9824 else if (flags & F_BLANK)
9825 sign = ' ';
9826 else
9827 sign = 0;
9828 }
9829 if (width < len)
9830 width = len;
9831 if (rescnt - (sign != 0) < width) {
9832 reslen -= rescnt;
9833 rescnt = width + fmtcnt + 100;
9834 reslen += rescnt;
9835 if (reslen < 0) {
9836 Py_XDECREF(temp);
9837 PyErr_NoMemory();
9838 goto onError;
9839 }
9840 if (_PyUnicode_Resize(&result, reslen) < 0) {
9841 Py_XDECREF(temp);
9842 goto onError;
9843 }
9844 res = PyUnicode_AS_UNICODE(result)
9845 + reslen - rescnt;
9846 }
9847 if (sign) {
9848 if (fill != ' ')
9849 *res++ = sign;
9850 rescnt--;
9851 if (width > len)
9852 width--;
9853 }
9854 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9855 assert(pbuf[0] == '0');
9856 assert(pbuf[1] == c);
9857 if (fill != ' ') {
9858 *res++ = *pbuf++;
9859 *res++ = *pbuf++;
9860 }
9861 rescnt -= 2;
9862 width -= 2;
9863 if (width < 0)
9864 width = 0;
9865 len -= 2;
9866 }
9867 if (width > len && !(flags & F_LJUST)) {
9868 do {
9869 --rescnt;
9870 *res++ = fill;
9871 } while (--width > len);
9872 }
9873 if (fill == ' ') {
9874 if (sign)
9875 *res++ = sign;
9876 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9877 assert(pbuf[0] == '0');
9878 assert(pbuf[1] == c);
9879 *res++ = *pbuf++;
9880 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009881 }
9882 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009883 Py_UNICODE_COPY(res, pbuf, len);
9884 res += len;
9885 rescnt -= len;
9886 while (--width >= len) {
9887 --rescnt;
9888 *res++ = ' ';
9889 }
9890 if (dict && (argidx < arglen) && c != '%') {
9891 PyErr_SetString(PyExc_TypeError,
9892 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009893 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009894 goto onError;
9895 }
9896 Py_XDECREF(temp);
9897 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898 } /* until end */
9899 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009900 PyErr_SetString(PyExc_TypeError,
9901 "not all arguments converted during string formatting");
9902 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903 }
9904
Thomas Woutersa96affe2006-03-12 00:29:36 +00009905 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009906 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009908 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909 }
9910 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911 return (PyObject *)result;
9912
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914 Py_XDECREF(result);
9915 Py_DECREF(uformat);
9916 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009917 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918 }
9919 return NULL;
9920}
9921
Jeremy Hylton938ace62002-07-17 16:30:39 +00009922static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009923unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9924
Tim Peters6d6c1a32001-08-02 04:15:00 +00009925static PyObject *
9926unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9927{
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009929 static char *kwlist[] = {"object", "encoding", "errors", 0};
9930 char *encoding = NULL;
9931 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009932
Benjamin Peterson14339b62009-01-31 16:36:08 +00009933 if (type != &PyUnicode_Type)
9934 return unicode_subtype_new(type, args, kwds);
9935 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009936 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009937 return NULL;
9938 if (x == NULL)
9939 return (PyObject *)_PyUnicode_New(0);
9940 if (encoding == NULL && errors == NULL)
9941 return PyObject_Str(x);
9942 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009943 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009944}
9945
Guido van Rossume023fe02001-08-30 03:12:59 +00009946static PyObject *
9947unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9948{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009949 PyUnicodeObject *tmp, *pnew;
9950 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009951
Benjamin Peterson14339b62009-01-31 16:36:08 +00009952 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9953 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9954 if (tmp == NULL)
9955 return NULL;
9956 assert(PyUnicode_Check(tmp));
9957 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9958 if (pnew == NULL) {
9959 Py_DECREF(tmp);
9960 return NULL;
9961 }
9962 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9963 if (pnew->str == NULL) {
9964 _Py_ForgetReference((PyObject *)pnew);
9965 PyObject_Del(pnew);
9966 Py_DECREF(tmp);
9967 return PyErr_NoMemory();
9968 }
9969 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9970 pnew->length = n;
9971 pnew->hash = tmp->hash;
9972 Py_DECREF(tmp);
9973 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009974}
9975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009976PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009977 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009978\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009979Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009980encoding defaults to the current default string encoding.\n\
9981errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009982
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009983static PyObject *unicode_iter(PyObject *seq);
9984
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009986 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009987 "str", /* tp_name */
9988 sizeof(PyUnicodeObject), /* tp_size */
9989 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009991 (destructor)unicode_dealloc, /* tp_dealloc */
9992 0, /* tp_print */
9993 0, /* tp_getattr */
9994 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009995 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009996 unicode_repr, /* tp_repr */
9997 &unicode_as_number, /* tp_as_number */
9998 &unicode_as_sequence, /* tp_as_sequence */
9999 &unicode_as_mapping, /* tp_as_mapping */
10000 (hashfunc) unicode_hash, /* tp_hash*/
10001 0, /* tp_call*/
10002 (reprfunc) unicode_str, /* tp_str */
10003 PyObject_GenericGetAttr, /* tp_getattro */
10004 0, /* tp_setattro */
10005 0, /* tp_as_buffer */
10006 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000010007 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010008 unicode_doc, /* tp_doc */
10009 0, /* tp_traverse */
10010 0, /* tp_clear */
10011 PyUnicode_RichCompare, /* tp_richcompare */
10012 0, /* tp_weaklistoffset */
10013 unicode_iter, /* tp_iter */
10014 0, /* tp_iternext */
10015 unicode_methods, /* tp_methods */
10016 0, /* tp_members */
10017 0, /* tp_getset */
10018 &PyBaseObject_Type, /* tp_base */
10019 0, /* tp_dict */
10020 0, /* tp_descr_get */
10021 0, /* tp_descr_set */
10022 0, /* tp_dictoffset */
10023 0, /* tp_init */
10024 0, /* tp_alloc */
10025 unicode_new, /* tp_new */
10026 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027};
10028
10029/* Initialize the Unicode implementation */
10030
Thomas Wouters78890102000-07-22 19:25:51 +000010031void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010033 int i;
10034
Thomas Wouters477c8d52006-05-27 19:21:47 +000010035 /* XXX - move this array to unicodectype.c ? */
10036 Py_UNICODE linebreak[] = {
10037 0x000A, /* LINE FEED */
10038 0x000D, /* CARRIAGE RETURN */
10039 0x001C, /* FILE SEPARATOR */
10040 0x001D, /* GROUP SEPARATOR */
10041 0x001E, /* RECORD SEPARATOR */
10042 0x0085, /* NEXT LINE */
10043 0x2028, /* LINE SEPARATOR */
10044 0x2029, /* PARAGRAPH SEPARATOR */
10045 };
10046
Fred Drakee4315f52000-05-09 19:53:39 +000010047 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010048 free_list = NULL;
10049 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010050 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010051 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010052 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010053
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010054 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010055 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010056 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010057 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010058
10059 /* initialize the linebreak bloom filter */
10060 bloom_linebreak = make_bloom_mask(
10061 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10062 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010063
10064 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065}
10066
10067/* Finalize the Unicode implementation */
10068
Christian Heimesa156e092008-02-16 07:38:31 +000010069int
10070PyUnicode_ClearFreeList(void)
10071{
10072 int freelist_size = numfree;
10073 PyUnicodeObject *u;
10074
10075 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 PyUnicodeObject *v = u;
10077 u = *(PyUnicodeObject **)u;
10078 if (v->str)
10079 PyObject_DEL(v->str);
10080 Py_XDECREF(v->defenc);
10081 PyObject_Del(v);
10082 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010083 }
10084 free_list = NULL;
10085 assert(numfree == 0);
10086 return freelist_size;
10087}
10088
Guido van Rossumd57fd912000-03-10 22:53:23 +000010089void
Thomas Wouters78890102000-07-22 19:25:51 +000010090_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010092 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010094 Py_XDECREF(unicode_empty);
10095 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010096
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010097 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010098 if (unicode_latin1[i]) {
10099 Py_DECREF(unicode_latin1[i]);
10100 unicode_latin1[i] = NULL;
10101 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010102 }
Christian Heimesa156e092008-02-16 07:38:31 +000010103 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010105
Walter Dörwald16807132007-05-25 13:52:07 +000010106void
10107PyUnicode_InternInPlace(PyObject **p)
10108{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010109 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10110 PyObject *t;
10111 if (s == NULL || !PyUnicode_Check(s))
10112 Py_FatalError(
10113 "PyUnicode_InternInPlace: unicode strings only please!");
10114 /* If it's a subclass, we don't really know what putting
10115 it in the interned dict might do. */
10116 if (!PyUnicode_CheckExact(s))
10117 return;
10118 if (PyUnicode_CHECK_INTERNED(s))
10119 return;
10120 if (interned == NULL) {
10121 interned = PyDict_New();
10122 if (interned == NULL) {
10123 PyErr_Clear(); /* Don't leave an exception */
10124 return;
10125 }
10126 }
10127 /* It might be that the GetItem call fails even
10128 though the key is present in the dictionary,
10129 namely when this happens during a stack overflow. */
10130 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010131 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010132 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010133
Benjamin Peterson29060642009-01-31 22:14:21 +000010134 if (t) {
10135 Py_INCREF(t);
10136 Py_DECREF(*p);
10137 *p = t;
10138 return;
10139 }
Walter Dörwald16807132007-05-25 13:52:07 +000010140
Benjamin Peterson14339b62009-01-31 16:36:08 +000010141 PyThreadState_GET()->recursion_critical = 1;
10142 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10143 PyErr_Clear();
10144 PyThreadState_GET()->recursion_critical = 0;
10145 return;
10146 }
10147 PyThreadState_GET()->recursion_critical = 0;
10148 /* The two references in interned are not counted by refcnt.
10149 The deallocator will take care of this */
10150 Py_REFCNT(s) -= 2;
10151 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010152}
10153
10154void
10155PyUnicode_InternImmortal(PyObject **p)
10156{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010157 PyUnicode_InternInPlace(p);
10158 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10159 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10160 Py_INCREF(*p);
10161 }
Walter Dörwald16807132007-05-25 13:52:07 +000010162}
10163
10164PyObject *
10165PyUnicode_InternFromString(const char *cp)
10166{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010167 PyObject *s = PyUnicode_FromString(cp);
10168 if (s == NULL)
10169 return NULL;
10170 PyUnicode_InternInPlace(&s);
10171 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010172}
10173
Alexander Belopolsky40018472011-02-26 01:02:56 +000010174void
10175_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010176{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010177 PyObject *keys;
10178 PyUnicodeObject *s;
10179 Py_ssize_t i, n;
10180 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010181
Benjamin Peterson14339b62009-01-31 16:36:08 +000010182 if (interned == NULL || !PyDict_Check(interned))
10183 return;
10184 keys = PyDict_Keys(interned);
10185 if (keys == NULL || !PyList_Check(keys)) {
10186 PyErr_Clear();
10187 return;
10188 }
Walter Dörwald16807132007-05-25 13:52:07 +000010189
Benjamin Peterson14339b62009-01-31 16:36:08 +000010190 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10191 detector, interned unicode strings are not forcibly deallocated;
10192 rather, we give them their stolen references back, and then clear
10193 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010194
Benjamin Peterson14339b62009-01-31 16:36:08 +000010195 n = PyList_GET_SIZE(keys);
10196 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010197 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010198 for (i = 0; i < n; i++) {
10199 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10200 switch (s->state) {
10201 case SSTATE_NOT_INTERNED:
10202 /* XXX Shouldn't happen */
10203 break;
10204 case SSTATE_INTERNED_IMMORTAL:
10205 Py_REFCNT(s) += 1;
10206 immortal_size += s->length;
10207 break;
10208 case SSTATE_INTERNED_MORTAL:
10209 Py_REFCNT(s) += 2;
10210 mortal_size += s->length;
10211 break;
10212 default:
10213 Py_FatalError("Inconsistent interned string state.");
10214 }
10215 s->state = SSTATE_NOT_INTERNED;
10216 }
10217 fprintf(stderr, "total size of all interned strings: "
10218 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10219 "mortal/immortal\n", mortal_size, immortal_size);
10220 Py_DECREF(keys);
10221 PyDict_Clear(interned);
10222 Py_DECREF(interned);
10223 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010224}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010225
10226
10227/********************* Unicode Iterator **************************/
10228
10229typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010230 PyObject_HEAD
10231 Py_ssize_t it_index;
10232 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010233} unicodeiterobject;
10234
10235static void
10236unicodeiter_dealloc(unicodeiterobject *it)
10237{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010238 _PyObject_GC_UNTRACK(it);
10239 Py_XDECREF(it->it_seq);
10240 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010241}
10242
10243static int
10244unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10245{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010246 Py_VISIT(it->it_seq);
10247 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010248}
10249
10250static PyObject *
10251unicodeiter_next(unicodeiterobject *it)
10252{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010253 PyUnicodeObject *seq;
10254 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010255
Benjamin Peterson14339b62009-01-31 16:36:08 +000010256 assert(it != NULL);
10257 seq = it->it_seq;
10258 if (seq == NULL)
10259 return NULL;
10260 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010261
Benjamin Peterson14339b62009-01-31 16:36:08 +000010262 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10263 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010264 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010265 if (item != NULL)
10266 ++it->it_index;
10267 return item;
10268 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010269
Benjamin Peterson14339b62009-01-31 16:36:08 +000010270 Py_DECREF(seq);
10271 it->it_seq = NULL;
10272 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010273}
10274
10275static PyObject *
10276unicodeiter_len(unicodeiterobject *it)
10277{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010278 Py_ssize_t len = 0;
10279 if (it->it_seq)
10280 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10281 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010282}
10283
10284PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10285
10286static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010287 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010288 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010289 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010290};
10291
10292PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010293 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10294 "str_iterator", /* tp_name */
10295 sizeof(unicodeiterobject), /* tp_basicsize */
10296 0, /* tp_itemsize */
10297 /* methods */
10298 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10299 0, /* tp_print */
10300 0, /* tp_getattr */
10301 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010302 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010303 0, /* tp_repr */
10304 0, /* tp_as_number */
10305 0, /* tp_as_sequence */
10306 0, /* tp_as_mapping */
10307 0, /* tp_hash */
10308 0, /* tp_call */
10309 0, /* tp_str */
10310 PyObject_GenericGetAttr, /* tp_getattro */
10311 0, /* tp_setattro */
10312 0, /* tp_as_buffer */
10313 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10314 0, /* tp_doc */
10315 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10316 0, /* tp_clear */
10317 0, /* tp_richcompare */
10318 0, /* tp_weaklistoffset */
10319 PyObject_SelfIter, /* tp_iter */
10320 (iternextfunc)unicodeiter_next, /* tp_iternext */
10321 unicodeiter_methods, /* tp_methods */
10322 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010323};
10324
10325static PyObject *
10326unicode_iter(PyObject *seq)
10327{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010328 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010329
Benjamin Peterson14339b62009-01-31 16:36:08 +000010330 if (!PyUnicode_Check(seq)) {
10331 PyErr_BadInternalCall();
10332 return NULL;
10333 }
10334 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10335 if (it == NULL)
10336 return NULL;
10337 it->it_index = 0;
10338 Py_INCREF(seq);
10339 it->it_seq = (PyUnicodeObject *)seq;
10340 _PyObject_GC_TRACK(it);
10341 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010342}
10343
Martin v. Löwis5b222132007-06-10 09:51:05 +000010344size_t
10345Py_UNICODE_strlen(const Py_UNICODE *u)
10346{
10347 int res = 0;
10348 while(*u++)
10349 res++;
10350 return res;
10351}
10352
10353Py_UNICODE*
10354Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10355{
10356 Py_UNICODE *u = s1;
10357 while ((*u++ = *s2++));
10358 return s1;
10359}
10360
10361Py_UNICODE*
10362Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10363{
10364 Py_UNICODE *u = s1;
10365 while ((*u++ = *s2++))
10366 if (n-- == 0)
10367 break;
10368 return s1;
10369}
10370
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010371Py_UNICODE*
10372Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10373{
10374 Py_UNICODE *u1 = s1;
10375 u1 += Py_UNICODE_strlen(u1);
10376 Py_UNICODE_strcpy(u1, s2);
10377 return s1;
10378}
10379
Martin v. Löwis5b222132007-06-10 09:51:05 +000010380int
10381Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10382{
10383 while (*s1 && *s2 && *s1 == *s2)
10384 s1++, s2++;
10385 if (*s1 && *s2)
10386 return (*s1 < *s2) ? -1 : +1;
10387 if (*s1)
10388 return 1;
10389 if (*s2)
10390 return -1;
10391 return 0;
10392}
10393
Victor Stinneref8d95c2010-08-16 22:03:11 +000010394int
10395Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10396{
10397 register Py_UNICODE u1, u2;
10398 for (; n != 0; n--) {
10399 u1 = *s1;
10400 u2 = *s2;
10401 if (u1 != u2)
10402 return (u1 < u2) ? -1 : +1;
10403 if (u1 == '\0')
10404 return 0;
10405 s1++;
10406 s2++;
10407 }
10408 return 0;
10409}
10410
Martin v. Löwis5b222132007-06-10 09:51:05 +000010411Py_UNICODE*
10412Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10413{
10414 const Py_UNICODE *p;
10415 for (p = s; *p; p++)
10416 if (*p == c)
10417 return (Py_UNICODE*)p;
10418 return NULL;
10419}
10420
Victor Stinner331ea922010-08-10 16:37:20 +000010421Py_UNICODE*
10422Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10423{
10424 const Py_UNICODE *p;
10425 p = s + Py_UNICODE_strlen(s);
10426 while (p != s) {
10427 p--;
10428 if (*p == c)
10429 return (Py_UNICODE*)p;
10430 }
10431 return NULL;
10432}
10433
Victor Stinner71133ff2010-09-01 23:43:53 +000010434Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010435PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010436{
10437 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10438 Py_UNICODE *copy;
10439 Py_ssize_t size;
10440
10441 /* Ensure we won't overflow the size. */
10442 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10443 PyErr_NoMemory();
10444 return NULL;
10445 }
10446 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10447 size *= sizeof(Py_UNICODE);
10448 copy = PyMem_Malloc(size);
10449 if (copy == NULL) {
10450 PyErr_NoMemory();
10451 return NULL;
10452 }
10453 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10454 return copy;
10455}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010456
Georg Brandl66c221e2010-10-14 07:04:07 +000010457/* A _string module, to export formatter_parser and formatter_field_name_split
10458 to the string.Formatter class implemented in Python. */
10459
10460static PyMethodDef _string_methods[] = {
10461 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10462 METH_O, PyDoc_STR("split the argument as a field name")},
10463 {"formatter_parser", (PyCFunction) formatter_parser,
10464 METH_O, PyDoc_STR("parse the argument as a format string")},
10465 {NULL, NULL}
10466};
10467
10468static struct PyModuleDef _string_module = {
10469 PyModuleDef_HEAD_INIT,
10470 "_string",
10471 PyDoc_STR("string helper module"),
10472 0,
10473 _string_methods,
10474 NULL,
10475 NULL,
10476 NULL,
10477 NULL
10478};
10479
10480PyMODINIT_FUNC
10481PyInit__string(void)
10482{
10483 return PyModule_Create(&_string_module);
10484}
10485
10486
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010487#ifdef __cplusplus
10488}
10489#endif