blob: 4361908c3a87bb634f7dc2d8f3500a9993f45208 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Limit for the Unicode object free list */
51
Christian Heimes2202f872008-02-06 14:31:34 +000052#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
54/* Limit for the Unicode object free list stay alive optimization.
55
56 The implementation will keep allocated Unicode memory intact for
57 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000058 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000059
Christian Heimes2202f872008-02-06 14:31:34 +000060 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000061 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000062 malloc()-overhead) bytes of unused garbage.
63
64 Setting the limit to 0 effectively turns the feature off.
65
Guido van Rossumfd4b9572000-04-10 13:51:10 +000066 Note: This is an experimental feature ! If you get core dumps when
67 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000068
69*/
70
Guido van Rossumfd4b9572000-04-10 13:51:10 +000071#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000072
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
98 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +000099 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters.

   One entry per ASCII code point (0x00-0x7F); an entry is 1 when the
   code point is treated as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00-0x07 */
    /* 0x0009 CHARACTER TABULATION, 0x000A LINE FEED, 0x000B LINE TABULATION,
       0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08-0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10-0x17 */
    /* 0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18-0x1F */
    /* 0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20-0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30-0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38-0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40-0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50-0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60-0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70-0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78-0x7F */
};
144
Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
/* Fast detection of line break characters in the ASCII range.

   One entry per ASCII code point (0x00-0x7F); an entry is 1 when the
   code point is treated as a line break and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00-0x07 */
    /* 0x000A LINE FEED, 0x000B LINE TABULATION,
       0x000C FORM FEED, 0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08-0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10-0x17 */
    /* 0x001C FILE SEPARATOR, 0x001D GROUP SEPARATOR,
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18-0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20-0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30-0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38-0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40-0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50-0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60-0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70-0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78-0x7F */
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
201/* stuff to implement simple "bloom filters" for Unicode characters.
202 to keep things simple, we use a single bitmask, using the least 5
203 bits from each unicode characters as the bit index. */
204
205/* the linebreak mask is set up by Unicode_Init below */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
Thomas Wouters477c8d52006-05-27 19:21:47 +0000217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
Antoine Pitrouf068f942010-01-13 14:19:12 +0000221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223
Benjamin Peterson29060642009-01-31 22:14:21 +0000224#define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* Nonzero when chr is in set: the bloom mask rejects most non-members
   cheaply before the exact linear search runs.  The expansion is fully
   parenthesized so it can be negated or combined with || safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
309/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000310 Ux0000 terminated; some code (e.g. new_identifier)
311 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312
313 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316*/
317
Alexander Belopolsky40018472011-02-26 01:02:56 +0000318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320{
321 register PyUnicodeObject *unicode;
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty);
326 return unicode_empty;
327 }
328
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000329 /* Ensure we won't overflow the size. */
330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331 return (PyUnicodeObject *)PyErr_NoMemory();
332 }
333
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000335 if (free_list) {
336 unicode = free_list;
337 free_list = *(PyUnicodeObject **)unicode;
338 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 if (unicode->str) {
340 /* Keep-Alive optimization: we only upsize the buffer,
341 never downsize it. */
342 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000343 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 PyObject_DEL(unicode->str);
345 unicode->str = NULL;
346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000347 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 }
352 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 }
354 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 if (unicode == NULL)
358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000363 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 PyErr_NoMemory();
365 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000367 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000368 * the caller fails before initializing str -- unicode_resize()
369 * reads str[0], and the Keep-Alive optimization can keep memory
370 * allocated for str alive across a call to unicode_dealloc(unicode).
371 * We don't want unicode_resize to read uninitialized memory in
372 * that case.
373 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000374 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000383 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000386 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388}
389
Alexander Belopolsky40018472011-02-26 01:02:56 +0000390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392{
Walter Dörwald16807132007-05-25 13:52:07 +0000393 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000394 case SSTATE_NOT_INTERNED:
395 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000396
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError(
402 "deletion of interned string failed");
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_IMMORTAL:
406 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 default:
409 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410 }
411
Guido van Rossum604ddf82001-12-06 20:03:56 +0000412 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000414 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416 PyObject_DEL(unicode->str);
417 unicode->str = NULL;
418 unicode->length = 0;
419 }
420 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000421 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 }
423 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000424 *(PyUnicodeObject **)unicode = free_list;
425 free_list = unicode;
426 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000429 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 }
433}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
Mark Dickinson081dfee2009-03-18 14:47:41 +0000581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
Walter Dörwald346737f2007-05-31 10:44:43 +0000681static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000684{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000685 *fmt++ = '%';
686 if (width) {
687 if (zeropad)
688 *fmt++ = '0';
689 fmt += sprintf(fmt, "%d", width);
690 }
691 if (precision)
692 fmt += sprintf(fmt, ".%d", precision);
693 if (longflag)
694 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000695 else if (longlongflag) {
696 /* longlongflag should only ever be nonzero on machines with
697 HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699 char *f = PY_FORMAT_LONG_LONG;
700 while (*f)
701 *fmt++ = *f++;
702#else
703 /* we shouldn't ever get here */
704 assert(0);
705 *fmt++ = 'l';
706#endif
707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000708 else if (size_tflag) {
709 char *f = PY_FORMAT_SIZE_T;
710 while (*f)
711 *fmt++ = *f++;
712 }
713 *fmt++ = c;
714 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000715}
716
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows the '%' at *f.

   Reads an optional decimal width, an optional ".precision" and an optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z"); a
   length modifier is only accepted when directly followed by 'd', 'u' or
   'i'.  Each p_* pointer may be NULL, in which case that value is not
   reported.

   Returns a pointer to the conversion character.  For truncated or bogus
   formats the pointer is moved one character back so that the caller's
   f++ cannot run past the end of the string. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* step over the '%' itself */
    ++f;

    /* "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); ++f)
        field_width = (field_width*10) + *f - '0';

    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); ++f)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }

    /* Handle %ld, %lu, %li, %lld, %llu, %lli and the size_t variants. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
781
/* Append the NUL-terminated char string `string` at the output cursor `s`,
   advancing `s` past the copied characters.  The expanding function must
   have locals named `s` (output cursor) and `copy` (const char * scratch).
   The body is wrapped in do/while(0) so the macro behaves as one statement,
   including inside an un-braced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy; ) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794PyObject *
795PyUnicode_FromFormatV(const char *format, va_list vargs)
796{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 va_list count;
798 Py_ssize_t callcount = 0;
799 PyObject **callresults = NULL;
800 PyObject **callresult = NULL;
801 Py_ssize_t n = 0;
802 int width = 0;
803 int precision = 0;
804 int zeropad;
805 const char* f;
806 Py_UNICODE *s;
807 PyObject *string;
808 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000809 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* use abuffer instead of buffer, if we need more space
811 * (which can happen if there's a format specifier with width). */
812 char *abuffer = NULL;
813 char *realbuffer;
814 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000816 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000818 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000819 /* step 1: count the number of %S/%R/%A/%s format specifications
820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000824 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +0000825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000828 ++callcount;
829 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000830 else if (128 <= (unsigned char)*f) {
831 PyErr_Format(PyExc_ValueError,
832 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000833 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000834 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000835 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000837 }
838 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000840 if (callcount) {
841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842 if (!callresults) {
843 PyErr_NoMemory();
844 return NULL;
845 }
846 callresult = callresults;
847 }
848 /* step 3: figure out how large a buffer we need */
849 for (f = format; *f; f++) {
850 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000851#ifdef HAVE_LONG_LONG
Victor Stinner96865452011-03-01 23:44:09 +0000852 int longlongflag;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000853#endif
Victor Stinner96865452011-03-01 23:44:09 +0000854 const char* p;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855
Victor Stinner96865452011-03-01 23:44:09 +0000856 p = f;
857 f = parse_format_flags(f, &width, NULL,
858 NULL, &longlongflag, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859
Benjamin Peterson14339b62009-01-31 16:36:08 +0000860 switch (*f) {
861 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000862 {
863#ifndef Py_UNICODE_WIDE
864 int ordinal = va_arg(count, int);
865 if (ordinal > 0xffff)
866 n += 2;
867 else
868 n++;
869#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000870 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000871 n++;
872#endif
873 break;
874 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000875 case '%':
876 n++;
877 break;
878 case 'd': case 'u': case 'i': case 'x':
879 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000880#ifdef HAVE_LONG_LONG
881 if (longlongflag) {
882 if (width < MAX_LONG_LONG_CHARS)
883 width = MAX_LONG_LONG_CHARS;
884 }
885 else
886#endif
887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888 including sign. Decimal takes the most space. This
889 isn't enough for octal. If a width is specified we
890 need more (which we allocate later). */
891 if (width < MAX_LONG_CHARS)
892 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000893 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000894 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000895 if (abuffersize < width)
896 abuffersize = width;
897 break;
898 case 's':
899 {
900 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000901 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903 if (!str)
904 goto fail;
905 n += PyUnicode_GET_SIZE(str);
906 /* Remember the str and switch to the next slot */
907 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000908 break;
909 }
910 case 'U':
911 {
912 PyObject *obj = va_arg(count, PyObject *);
913 assert(obj && PyUnicode_Check(obj));
914 n += PyUnicode_GET_SIZE(obj);
915 break;
916 }
917 case 'V':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000921 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000922 assert(obj || str);
923 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +0000924 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000925 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000926 *callresult++ = NULL;
927 }
928 else {
929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930 if (!str_obj)
931 goto fail;
932 n += PyUnicode_GET_SIZE(str_obj);
933 *callresult++ = str_obj;
934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000935 break;
936 }
937 case 'S':
938 {
939 PyObject *obj = va_arg(count, PyObject *);
940 PyObject *str;
941 assert(obj);
942 str = PyObject_Str(obj);
943 if (!str)
944 goto fail;
945 n += PyUnicode_GET_SIZE(str);
946 /* Remember the str and switch to the next slot */
947 *callresult++ = str;
948 break;
949 }
950 case 'R':
951 {
952 PyObject *obj = va_arg(count, PyObject *);
953 PyObject *repr;
954 assert(obj);
955 repr = PyObject_Repr(obj);
956 if (!repr)
957 goto fail;
958 n += PyUnicode_GET_SIZE(repr);
959 /* Remember the repr and switch to the next slot */
960 *callresult++ = repr;
961 break;
962 }
963 case 'A':
964 {
965 PyObject *obj = va_arg(count, PyObject *);
966 PyObject *ascii;
967 assert(obj);
968 ascii = PyObject_ASCII(obj);
969 if (!ascii)
970 goto fail;
971 n += PyUnicode_GET_SIZE(ascii);
972 /* Remember the repr and switch to the next slot */
973 *callresult++ = ascii;
974 break;
975 }
976 case 'p':
977 (void) va_arg(count, int);
978 /* maximum 64-bit pointer representation:
979 * 0xffffffffffffffff
980 * so 19 characters is enough.
981 * XXX I count 18 -- what's the extra for?
982 */
983 n += 19;
984 break;
985 default:
986 /* if we stumble upon an unknown
987 formatting code, copy the rest of
988 the format string to the output
989 string. (we cannot just skip the
990 code, since there's no way to know
991 what's in the argument list) */
992 n += strlen(p);
993 goto expand;
994 }
995 } else
996 n++;
997 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000998 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000999 if (abuffersize > ITEM_BUFFER_LEN) {
1000 /* add 1 for sprintf's trailing null byte */
1001 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 if (!abuffer) {
1003 PyErr_NoMemory();
1004 goto fail;
1005 }
1006 realbuffer = abuffer;
1007 }
1008 else
1009 realbuffer = buffer;
1010 /* step 4: fill the buffer */
1011 /* Since we've analyzed how much space we need for the worst case,
1012 we don't have to resize the string.
1013 There can be no errors beyond this point. */
1014 string = PyUnicode_FromUnicode(NULL, n);
1015 if (!string)
1016 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001017
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 s = PyUnicode_AS_UNICODE(string);
1019 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 for (f = format; *f; f++) {
1022 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001023 const char* p;
1024 int longflag;
1025 int longlongflag;
1026 int size_tflag;
1027
1028 p = f;
1029 zeropad = (f[1] == '0');
1030 f = parse_format_flags(f, &width, &precision,
1031 &longflag, &longlongflag, &size_tflag);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001032
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 switch (*f) {
1034 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001035 {
1036 int ordinal = va_arg(vargs, int);
1037#ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 *s++ = 0xD800 | (ordinal >> 10);
1041 *s++ = 0xDC00 | (ordinal & 0x3FF);
1042 } else
1043#endif
1044 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001046 }
Victor Stinner6d970f42011-03-02 00:04:25 +00001047 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001048 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001049 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
Victor Stinner6d970f42011-03-02 00:04:25 +00001050 width, precision, *f);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001051 if (longflag)
1052 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001053#ifdef HAVE_LONG_LONG
1054 else if (longlongflag)
1055 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1056#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001057 else if (size_tflag)
1058 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1059 else
1060 sprintf(realbuffer, fmt, va_arg(vargs, int));
1061 appendstring(realbuffer);
1062 break;
1063 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001064 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1065 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001066 if (longflag)
1067 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001068#ifdef HAVE_LONG_LONG
1069 else if (longlongflag)
1070 sprintf(realbuffer, fmt, va_arg(vargs,
1071 unsigned PY_LONG_LONG));
1072#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001073 else if (size_tflag)
1074 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1075 else
1076 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1077 appendstring(realbuffer);
1078 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001079 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001080 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001081 sprintf(realbuffer, fmt, va_arg(vargs, int));
1082 appendstring(realbuffer);
1083 break;
1084 case 's':
1085 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001086 /* unused, since we already have the result */
1087 (void) va_arg(vargs, char *);
1088 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1089 PyUnicode_GET_SIZE(*callresult));
1090 s += PyUnicode_GET_SIZE(*callresult);
1091 /* We're done with the unicode()/repr() => forget it */
1092 Py_DECREF(*callresult);
1093 /* switch to next unicode()/repr() result */
1094 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001095 break;
1096 }
1097 case 'U':
1098 {
1099 PyObject *obj = va_arg(vargs, PyObject *);
1100 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102 s += size;
1103 break;
1104 }
1105 case 'V':
1106 {
1107 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001108 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001109 if (obj) {
1110 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1111 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1112 s += size;
1113 } else {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001114 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1115 PyUnicode_GET_SIZE(*callresult));
1116 s += PyUnicode_GET_SIZE(*callresult);
1117 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001118 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001119 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001120 break;
1121 }
1122 case 'S':
1123 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001124 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001125 {
1126 Py_UNICODE *ucopy;
1127 Py_ssize_t usize;
1128 Py_ssize_t upos;
1129 /* unused, since we already have the result */
1130 (void) va_arg(vargs, PyObject *);
1131 ucopy = PyUnicode_AS_UNICODE(*callresult);
1132 usize = PyUnicode_GET_SIZE(*callresult);
1133 for (upos = 0; upos<usize;)
1134 *s++ = ucopy[upos++];
1135 /* We're done with the unicode()/repr() => forget it */
1136 Py_DECREF(*callresult);
1137 /* switch to next unicode()/repr() result */
1138 ++callresult;
1139 break;
1140 }
1141 case 'p':
1142 sprintf(buffer, "%p", va_arg(vargs, void*));
1143 /* %p is ill-defined: ensure leading 0x. */
1144 if (buffer[1] == 'X')
1145 buffer[1] = 'x';
1146 else if (buffer[1] != 'x') {
1147 memmove(buffer+2, buffer, strlen(buffer)+1);
1148 buffer[0] = '0';
1149 buffer[1] = 'x';
1150 }
1151 appendstring(buffer);
1152 break;
1153 case '%':
1154 *s++ = '%';
1155 break;
1156 default:
1157 appendstring(p);
1158 goto end;
1159 }
Victor Stinner1205f272010-09-11 00:54:47 +00001160 }
Victor Stinner1205f272010-09-11 00:54:47 +00001161 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001162 *s++ = *f;
1163 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001164
Benjamin Peterson29060642009-01-31 22:14:21 +00001165 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 if (callresults)
1167 PyObject_Free(callresults);
1168 if (abuffer)
1169 PyObject_Free(abuffer);
1170 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1171 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001172 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001173 if (callresults) {
1174 PyObject **callresult2 = callresults;
1175 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001176 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001177 ++callresult2;
1178 }
1179 PyObject_Free(callresults);
1180 }
1181 if (abuffer)
1182 PyObject_Free(abuffer);
1183 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001184}
1185
1186#undef appendstring
1187
1188PyObject *
1189PyUnicode_FromFormat(const char *format, ...)
1190{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001191 PyObject* ret;
1192 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001193
1194#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001195 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001196#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001197 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001198#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001199 ret = PyUnicode_FromFormatV(format, vargs);
1200 va_end(vargs);
1201 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001202}
1203
Victor Stinner5593d8a2010-10-02 11:11:27 +00001204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1205 convert a Unicode object to a wide character string.
1206
1207 - If w is NULL: return the number of wide characters (including the nul
1208 character) required to convert the unicode object. Ignore size argument.
1209
1210 - Otherwise: return the number of wide characters (excluding the nul
1211 character) written into w. Write at most size wide characters (including
1212 the nul character). */
1213static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001214unicode_aswidechar(PyUnicodeObject *unicode,
1215 wchar_t *w,
1216 Py_ssize_t size)
1217{
1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001219 Py_ssize_t res;
1220 if (w != NULL) {
1221 res = PyUnicode_GET_SIZE(unicode);
1222 if (size > res)
1223 size = res + 1;
1224 else
1225 res = size;
1226 memcpy(w, unicode->str, size * sizeof(wchar_t));
1227 return res;
1228 }
1229 else
1230 return PyUnicode_GET_SIZE(unicode) + 1;
1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1232 register const Py_UNICODE *u;
1233 const Py_UNICODE *uend;
1234 const wchar_t *worig, *wend;
1235 Py_ssize_t nchar;
1236
Victor Stinner137c34c2010-09-29 10:25:54 +00001237 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001238 uend = u + PyUnicode_GET_SIZE(unicode);
1239 if (w != NULL) {
1240 worig = w;
1241 wend = w + size;
1242 while (u != uend && w != wend) {
1243 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1244 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1245 {
1246 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1247 u += 2;
1248 }
1249 else {
1250 *w = *u;
1251 u++;
1252 }
1253 w++;
1254 }
1255 if (w != wend)
1256 *w = L'\0';
1257 return w - worig;
1258 }
1259 else {
1260 nchar = 1; /* nul character at the end */
1261 while (u != uend) {
1262 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1263 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1264 u += 2;
1265 else
1266 u++;
1267 nchar++;
1268 }
1269 }
1270 return nchar;
1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1272 register Py_UNICODE *u, *uend, ordinal;
1273 register Py_ssize_t i;
1274 wchar_t *worig, *wend;
1275 Py_ssize_t nchar;
1276
1277 u = PyUnicode_AS_UNICODE(unicode);
1278 uend = u + PyUnicode_GET_SIZE(u);
1279 if (w != NULL) {
1280 worig = w;
1281 wend = w + size;
1282 while (u != uend && w != wend) {
1283 ordinal = *u;
1284 if (ordinal > 0xffff) {
1285 ordinal -= 0x10000;
1286 *w++ = 0xD800 | (ordinal >> 10);
1287 *w++ = 0xDC00 | (ordinal & 0x3FF);
1288 }
1289 else
1290 *w++ = ordinal;
1291 u++;
1292 }
1293 if (w != wend)
1294 *w = 0;
1295 return w - worig;
1296 }
1297 else {
1298 nchar = 1; /* nul character */
1299 while (u != uend) {
1300 if (*u > 0xffff)
1301 nchar += 2;
1302 else
1303 nchar++;
1304 u++;
1305 }
1306 return nchar;
1307 }
1308#else
1309# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001310#endif
1311}
1312
1313Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001314PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001315 wchar_t *w,
1316 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001317{
1318 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001319 PyErr_BadInternalCall();
1320 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001322 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001323}
1324
Victor Stinner137c34c2010-09-29 10:25:54 +00001325wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001326PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001327 Py_ssize_t *size)
1328{
1329 wchar_t* buffer;
1330 Py_ssize_t buflen;
1331
1332 if (unicode == NULL) {
1333 PyErr_BadInternalCall();
1334 return NULL;
1335 }
1336
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001337 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001338 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001339 PyErr_NoMemory();
1340 return NULL;
1341 }
1342
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1344 if (buffer == NULL) {
1345 PyErr_NoMemory();
1346 return NULL;
1347 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001348 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001349 if (size != NULL)
1350 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001351 return buffer;
1352}
1353
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354#endif
1355
Alexander Belopolsky40018472011-02-26 01:02:56 +00001356PyObject *
1357PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001358{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001359 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001360
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001361 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001362 PyErr_SetString(PyExc_ValueError,
1363 "chr() arg not in range(0x110000)");
1364 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001365 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001366
1367#ifndef Py_UNICODE_WIDE
1368 if (ordinal > 0xffff) {
1369 ordinal -= 0x10000;
1370 s[0] = 0xD800 | (ordinal >> 10);
1371 s[1] = 0xDC00 | (ordinal & 0x3FF);
1372 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001373 }
1374#endif
1375
Hye-Shik Chang40574832004-04-06 07:24:51 +00001376 s[0] = (Py_UNICODE)ordinal;
1377 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001378}
1379
Alexander Belopolsky40018472011-02-26 01:02:56 +00001380PyObject *
1381PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001383 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001384 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001385 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001386 Py_INCREF(obj);
1387 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001388 }
1389 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 /* For a Unicode subtype that's not a Unicode object,
1391 return a true Unicode object with the same data. */
1392 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1393 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001394 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001395 PyErr_Format(PyExc_TypeError,
1396 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001397 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001398 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001399}
1400
Alexander Belopolsky40018472011-02-26 01:02:56 +00001401PyObject *
1402PyUnicode_FromEncodedObject(register PyObject *obj,
1403 const char *encoding,
1404 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001405{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001406 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001407 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001408
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001410 PyErr_BadInternalCall();
1411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001413
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001414 /* Decoding bytes objects is the most common case and should be fast */
1415 if (PyBytes_Check(obj)) {
1416 if (PyBytes_GET_SIZE(obj) == 0) {
1417 Py_INCREF(unicode_empty);
1418 v = (PyObject *) unicode_empty;
1419 }
1420 else {
1421 v = PyUnicode_Decode(
1422 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1423 encoding, errors);
1424 }
1425 return v;
1426 }
1427
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001428 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001429 PyErr_SetString(PyExc_TypeError,
1430 "decoding str is not supported");
1431 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001432 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001433
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001434 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1435 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1436 PyErr_Format(PyExc_TypeError,
1437 "coercing to str: need bytes, bytearray "
1438 "or buffer-like object, %.80s found",
1439 Py_TYPE(obj)->tp_name);
1440 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001441 }
Tim Petersced69f82003-09-16 20:30:58 +00001442
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001443 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001445 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 }
Tim Petersced69f82003-09-16 20:30:58 +00001447 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001448 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001449
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001450 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001451 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The normalized, nul-terminated name is written to
   lower, which has room for lower_len characters including the nul.

   Return 0 on error (encoding is longer than lower_len-1, or lower_len is
   0 so that not even the terminator fits), 1 on success.  On error the
   contents of lower are not nul-terminated and must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* lower_len - 1 below would wrap around for an empty buffer */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* last slot is reserved for the terminating nul */
    l_end = &lower[lower_len - 1];
    while (*e) {
        char ch;

        if (l == l_end)
            return 0;
        ch = *e++;
        if (Py_ISUPPER(ch))
            *l++ = Py_TOLOWER(ch);
        else if (ch == '_')
            *l++ = '-';
        else
            *l++ = ch;
    }
    *l = '\0';
    return 1;
}
1486
Alexander Belopolsky40018472011-02-26 01:02:56 +00001487PyObject *
1488PyUnicode_Decode(const char *s,
1489 Py_ssize_t size,
1490 const char *encoding,
1491 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001492{
1493 PyObject *buffer = NULL, *unicode;
1494 Py_buffer info;
1495 char lower[11]; /* Enough for any encoding shortcut */
1496
1497 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001499
1500 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001501 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001502 if ((strcmp(lower, "utf-8") == 0) ||
1503 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001504 return PyUnicode_DecodeUTF8(s, size, errors);
1505 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001506 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001507 (strcmp(lower, "iso-8859-1") == 0))
1508 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001509#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001510 else if (strcmp(lower, "mbcs") == 0)
1511 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001512#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001513 else if (strcmp(lower, "ascii") == 0)
1514 return PyUnicode_DecodeASCII(s, size, errors);
1515 else if (strcmp(lower, "utf-16") == 0)
1516 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1517 else if (strcmp(lower, "utf-32") == 0)
1518 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1519 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520
1521 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001522 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001523 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001524 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001525 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 if (buffer == NULL)
1527 goto onError;
1528 unicode = PyCodec_Decode(buffer, encoding, errors);
1529 if (unicode == NULL)
1530 goto onError;
1531 if (!PyUnicode_Check(unicode)) {
1532 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001533 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001534 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535 Py_DECREF(unicode);
1536 goto onError;
1537 }
1538 Py_DECREF(buffer);
1539 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001540
Benjamin Peterson29060642009-01-31 22:14:21 +00001541 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 Py_XDECREF(buffer);
1543 return NULL;
1544}
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546PyObject *
1547PyUnicode_AsDecodedObject(PyObject *unicode,
1548 const char *encoding,
1549 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001550{
1551 PyObject *v;
1552
1553 if (!PyUnicode_Check(unicode)) {
1554 PyErr_BadArgument();
1555 goto onError;
1556 }
1557
1558 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001559 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001560
1561 /* Decode via the codec registry */
1562 v = PyCodec_Decode(unicode, encoding, errors);
1563 if (v == NULL)
1564 goto onError;
1565 return v;
1566
Benjamin Peterson29060642009-01-31 22:14:21 +00001567 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001568 return NULL;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571PyObject *
1572PyUnicode_AsDecodedUnicode(PyObject *unicode,
1573 const char *encoding,
1574 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001575{
1576 PyObject *v;
1577
1578 if (!PyUnicode_Check(unicode)) {
1579 PyErr_BadArgument();
1580 goto onError;
1581 }
1582
1583 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001585
1586 /* Decode via the codec registry */
1587 v = PyCodec_Decode(unicode, encoding, errors);
1588 if (v == NULL)
1589 goto onError;
1590 if (!PyUnicode_Check(v)) {
1591 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001592 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001593 Py_TYPE(v)->tp_name);
1594 Py_DECREF(v);
1595 goto onError;
1596 }
1597 return v;
1598
Benjamin Peterson29060642009-01-31 22:14:21 +00001599 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001600 return NULL;
1601}
1602
Alexander Belopolsky40018472011-02-26 01:02:56 +00001603PyObject *
1604PyUnicode_Encode(const Py_UNICODE *s,
1605 Py_ssize_t size,
1606 const char *encoding,
1607 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608{
1609 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001610
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 unicode = PyUnicode_FromUnicode(s, size);
1612 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1615 Py_DECREF(unicode);
1616 return v;
1617}
1618
Alexander Belopolsky40018472011-02-26 01:02:56 +00001619PyObject *
1620PyUnicode_AsEncodedObject(PyObject *unicode,
1621 const char *encoding,
1622 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001623{
1624 PyObject *v;
1625
1626 if (!PyUnicode_Check(unicode)) {
1627 PyErr_BadArgument();
1628 goto onError;
1629 }
1630
1631 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001632 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001633
1634 /* Encode via the codec registry */
1635 v = PyCodec_Encode(unicode, encoding, errors);
1636 if (v == NULL)
1637 goto onError;
1638 return v;
1639
Benjamin Peterson29060642009-01-31 22:14:21 +00001640 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001641 return NULL;
1642}
1643
Victor Stinnerad158722010-10-27 00:25:46 +00001644PyObject *
1645PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001646{
Victor Stinner313a1202010-06-11 23:56:51 +00001647#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001648 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode),
1650 NULL);
1651#elif defined(__APPLE__)
1652 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 "surrogateescape");
1655#else
Victor Stinner793b5312011-04-27 00:24:21 +02001656 PyInterpreterState *interp = PyThreadState_GET()->interp;
1657 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1658 cannot use it to encode and decode filenames before it is loaded. Load
1659 the Python codec requires to encode at least its own filename. Use the C
1660 version of the locale codec until the codec registry is initialized and
1661 the Python codec is loaded.
1662
1663 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1664 cannot only rely on it: check also interp->fscodec_initialized for
1665 subinterpreters. */
1666 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001667 return PyUnicode_AsEncodedString(unicode,
1668 Py_FileSystemDefaultEncoding,
1669 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001670 }
1671 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001672 /* locale encoding with surrogateescape */
1673 wchar_t *wchar;
1674 char *bytes;
1675 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001676 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001677
1678 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1679 if (wchar == NULL)
1680 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001681 bytes = _Py_wchar2char(wchar, &error_pos);
1682 if (bytes == NULL) {
1683 if (error_pos != (size_t)-1) {
1684 char *errmsg = strerror(errno);
1685 PyObject *exc = NULL;
1686 if (errmsg == NULL)
1687 errmsg = "Py_wchar2char() failed";
1688 raise_encode_exception(&exc,
1689 "filesystemencoding",
1690 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1691 error_pos, error_pos+1,
1692 errmsg);
1693 Py_XDECREF(exc);
1694 }
1695 else
1696 PyErr_NoMemory();
1697 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001698 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001699 }
1700 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001701
1702 bytes_obj = PyBytes_FromString(bytes);
1703 PyMem_Free(bytes);
1704 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001705 }
Victor Stinnerad158722010-10-27 00:25:46 +00001706#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001707}
1708
Alexander Belopolsky40018472011-02-26 01:02:56 +00001709PyObject *
1710PyUnicode_AsEncodedString(PyObject *unicode,
1711 const char *encoding,
1712 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713{
1714 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001715 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001716
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 if (!PyUnicode_Check(unicode)) {
1718 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720 }
Fred Drakee4315f52000-05-09 19:53:39 +00001721
Victor Stinner2f283c22011-03-02 01:21:46 +00001722 if (encoding == NULL) {
1723 if (errors == NULL || strcmp(errors, "strict") == 0)
1724 return PyUnicode_AsUTF8String(unicode);
1725 else
1726 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1727 PyUnicode_GET_SIZE(unicode),
1728 errors);
1729 }
Fred Drakee4315f52000-05-09 19:53:39 +00001730
1731 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001732 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001733 if ((strcmp(lower, "utf-8") == 0) ||
1734 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00001735 {
Victor Stinner2f283c22011-03-02 01:21:46 +00001736 if (errors == NULL || strcmp(errors, "strict") == 0)
Victor Stinnera5c68c32011-03-02 01:03:14 +00001737 return PyUnicode_AsUTF8String(unicode);
Victor Stinner2f283c22011-03-02 01:21:46 +00001738 else
Victor Stinnera5c68c32011-03-02 01:03:14 +00001739 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1740 PyUnicode_GET_SIZE(unicode),
1741 errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00001742 }
Victor Stinner37296e82010-06-10 13:36:23 +00001743 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001744 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001745 (strcmp(lower, "iso-8859-1") == 0))
1746 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1747 PyUnicode_GET_SIZE(unicode),
1748 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001749#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001750 else if (strcmp(lower, "mbcs") == 0)
1751 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1752 PyUnicode_GET_SIZE(unicode),
1753 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001754#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001755 else if (strcmp(lower, "ascii") == 0)
1756 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1757 PyUnicode_GET_SIZE(unicode),
1758 errors);
1759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760
1761 /* Encode via the codec registry */
1762 v = PyCodec_Encode(unicode, encoding, errors);
1763 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001764 return NULL;
1765
1766 /* The normal path */
1767 if (PyBytes_Check(v))
1768 return v;
1769
1770 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001771 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001772 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001773 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001774
1775 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1776 "encoder %s returned bytearray instead of bytes",
1777 encoding);
1778 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001779 Py_DECREF(v);
1780 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001781 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001782
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001783 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1784 Py_DECREF(v);
1785 return b;
1786 }
1787
1788 PyErr_Format(PyExc_TypeError,
1789 "encoder did not return a bytes object (type=%.400s)",
1790 Py_TYPE(v)->tp_name);
1791 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001792 return NULL;
1793}
1794
Alexander Belopolsky40018472011-02-26 01:02:56 +00001795PyObject *
1796PyUnicode_AsEncodedUnicode(PyObject *unicode,
1797 const char *encoding,
1798 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001799{
1800 PyObject *v;
1801
1802 if (!PyUnicode_Check(unicode)) {
1803 PyErr_BadArgument();
1804 goto onError;
1805 }
1806
1807 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001809
1810 /* Encode via the codec registry */
1811 v = PyCodec_Encode(unicode, encoding, errors);
1812 if (v == NULL)
1813 goto onError;
1814 if (!PyUnicode_Check(v)) {
1815 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001816 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001817 Py_TYPE(v)->tp_name);
1818 Py_DECREF(v);
1819 goto onError;
1820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001822
Benjamin Peterson29060642009-01-31 22:14:21 +00001823 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824 return NULL;
1825}
1826
Alexander Belopolsky40018472011-02-26 01:02:56 +00001827PyObject *
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001828_PyUnicode_AsDefaultEncodedString(PyObject *unicode)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001829{
1830 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001831 if (v)
1832 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +00001833 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001834 PyUnicode_GET_SIZE(unicode),
1835 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001836 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001837 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001838 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001839 return v;
1840}
1841
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001842PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001843PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001844 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001845 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1846}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001847
Christian Heimes5894ba72007-11-04 11:43:14 +00001848PyObject*
1849PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1850{
Victor Stinnerad158722010-10-27 00:25:46 +00001851#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1852 return PyUnicode_DecodeMBCS(s, size, NULL);
1853#elif defined(__APPLE__)
1854 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1855#else
Victor Stinner793b5312011-04-27 00:24:21 +02001856 PyInterpreterState *interp = PyThreadState_GET()->interp;
1857 /* Bootstrap check: if the filesystem codec is implemented in Python, we
1858 cannot use it to encode and decode filenames before it is loaded. Load
1859 the Python codec requires to encode at least its own filename. Use the C
1860 version of the locale codec until the codec registry is initialized and
1861 the Python codec is loaded.
1862
1863 Py_FileSystemDefaultEncoding is shared between all interpreters, we
1864 cannot only rely on it: check also interp->fscodec_initialized for
1865 subinterpreters. */
1866 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001867 return PyUnicode_Decode(s, size,
1868 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001869 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001870 }
1871 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001872 /* locale encoding with surrogateescape */
1873 wchar_t *wchar;
1874 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001875 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001876
1877 if (s[size] != '\0' || size != strlen(s)) {
1878 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1879 return NULL;
1880 }
1881
Victor Stinner168e1172010-10-16 23:16:16 +00001882 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001883 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001884 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001885
Victor Stinner168e1172010-10-16 23:16:16 +00001886 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001887 PyMem_Free(wchar);
1888 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001889 }
Victor Stinnerad158722010-10-27 00:25:46 +00001890#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001891}
1892
Martin v. Löwis011e8422009-05-05 04:43:17 +00001893
1894int
1895PyUnicode_FSConverter(PyObject* arg, void* addr)
1896{
1897 PyObject *output = NULL;
1898 Py_ssize_t size;
1899 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001900 if (arg == NULL) {
1901 Py_DECREF(*(PyObject**)addr);
1902 return 1;
1903 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001904 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001905 output = arg;
1906 Py_INCREF(output);
1907 }
1908 else {
1909 arg = PyUnicode_FromObject(arg);
1910 if (!arg)
1911 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001912 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001913 Py_DECREF(arg);
1914 if (!output)
1915 return 0;
1916 if (!PyBytes_Check(output)) {
1917 Py_DECREF(output);
1918 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1919 return 0;
1920 }
1921 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001922 size = PyBytes_GET_SIZE(output);
1923 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001924 if (size != strlen(data)) {
1925 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1926 Py_DECREF(output);
1927 return 0;
1928 }
1929 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001930 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001931}
1932
1933
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001934int
1935PyUnicode_FSDecoder(PyObject* arg, void* addr)
1936{
1937 PyObject *output = NULL;
1938 Py_ssize_t size;
1939 void *data;
1940 if (arg == NULL) {
1941 Py_DECREF(*(PyObject**)addr);
1942 return 1;
1943 }
1944 if (PyUnicode_Check(arg)) {
1945 output = arg;
1946 Py_INCREF(output);
1947 }
1948 else {
1949 arg = PyBytes_FromObject(arg);
1950 if (!arg)
1951 return 0;
1952 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1953 PyBytes_GET_SIZE(arg));
1954 Py_DECREF(arg);
1955 if (!output)
1956 return 0;
1957 if (!PyUnicode_Check(output)) {
1958 Py_DECREF(output);
1959 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1960 return 0;
1961 }
1962 }
1963 size = PyUnicode_GET_SIZE(output);
1964 data = PyUnicode_AS_UNICODE(output);
1965 if (size != Py_UNICODE_strlen(data)) {
1966 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1967 Py_DECREF(output);
1968 return 0;
1969 }
1970 *(PyObject**)addr = output;
1971 return Py_CLEANUP_SUPPORTED;
1972}
1973
1974
Martin v. Löwis5b222132007-06-10 09:51:05 +00001975char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001976_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001977{
Christian Heimesf3863112007-11-22 07:46:41 +00001978 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001979 if (!PyUnicode_Check(unicode)) {
1980 PyErr_BadArgument();
1981 return NULL;
1982 }
Victor Stinnerf3fd7332011-03-02 01:03:11 +00001983 bytes = _PyUnicode_AsDefaultEncodedString(unicode);
Christian Heimesf3863112007-11-22 07:46:41 +00001984 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001985 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001986 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001987 *psize = PyBytes_GET_SIZE(bytes);
1988 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001989}
1990
1991char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001992_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001993{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001994 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001995}
1996
Alexander Belopolsky40018472011-02-26 01:02:56 +00001997Py_UNICODE *
1998PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001999{
2000 if (!PyUnicode_Check(unicode)) {
2001 PyErr_BadArgument();
2002 goto onError;
2003 }
2004 return PyUnicode_AS_UNICODE(unicode);
2005
Benjamin Peterson29060642009-01-31 22:14:21 +00002006 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007 return NULL;
2008}
2009
Alexander Belopolsky40018472011-02-26 01:02:56 +00002010Py_ssize_t
2011PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012{
2013 if (!PyUnicode_Check(unicode)) {
2014 PyErr_BadArgument();
2015 goto onError;
2016 }
2017 return PyUnicode_GET_SIZE(unicode);
2018
Benjamin Peterson29060642009-01-31 22:14:21 +00002019 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002020 return -1;
2021}
2022
/* Name of the default encoding: always the constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2028
Victor Stinner554f3f02010-06-16 23:33:54 +00002029/* create or adjust a UnicodeDecodeError */
2030static void
2031make_decode_exception(PyObject **exceptionObject,
2032 const char *encoding,
2033 const char *input, Py_ssize_t length,
2034 Py_ssize_t startpos, Py_ssize_t endpos,
2035 const char *reason)
2036{
2037 if (*exceptionObject == NULL) {
2038 *exceptionObject = PyUnicodeDecodeError_Create(
2039 encoding, input, length, startpos, endpos, reason);
2040 }
2041 else {
2042 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2043 goto onError;
2044 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2045 goto onError;
2046 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2047 goto onError;
2048 }
2049 return;
2050
2051onError:
2052 Py_DECREF(*exceptionObject);
2053 *exceptionObject = NULL;
2054}
2055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002056/* error handling callback helper:
2057 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002058 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002059 and adjust various state variables.
2060 return 0 on success, -1 on error
2061*/
2062
Alexander Belopolsky40018472011-02-26 01:02:56 +00002063static int
2064unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2065 const char *encoding, const char *reason,
2066 const char **input, const char **inend, Py_ssize_t *startinpos,
2067 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2068 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002069{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002070 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002071
2072 PyObject *restuple = NULL;
2073 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002074 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002075 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002076 Py_ssize_t requiredsize;
2077 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002078 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002079 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002080 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002081 int res = -1;
2082
2083 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002084 *errorHandler = PyCodec_LookupError(errors);
2085 if (*errorHandler == NULL)
2086 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002087 }
2088
Victor Stinner554f3f02010-06-16 23:33:54 +00002089 make_decode_exception(exceptionObject,
2090 encoding,
2091 *input, *inend - *input,
2092 *startinpos, *endinpos,
2093 reason);
2094 if (*exceptionObject == NULL)
2095 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002096
2097 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2098 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002099 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002100 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002101 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002102 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002103 }
2104 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002105 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002106
2107 /* Copy back the bytes variables, which might have been modified by the
2108 callback */
2109 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2110 if (!inputobj)
2111 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002112 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002113 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002114 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002115 *input = PyBytes_AS_STRING(inputobj);
2116 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002117 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002118 /* we can DECREF safely, as the exception has another reference,
2119 so the object won't go away. */
2120 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002121
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002123 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002124 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002125 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2126 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002127 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002128
2129 /* need more space? (at least enough for what we
2130 have+the replacement+the rest of the string (starting
2131 at the new input position), so we won't have to check space
2132 when there are no errors in the rest of the string) */
2133 repptr = PyUnicode_AS_UNICODE(repunicode);
2134 repsize = PyUnicode_GET_SIZE(repunicode);
2135 requiredsize = *outpos + repsize + insize-newpos;
2136 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002137 if (requiredsize<2*outsize)
2138 requiredsize = 2*outsize;
2139 if (_PyUnicode_Resize(output, requiredsize) < 0)
2140 goto onError;
2141 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 }
2143 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002144 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002145 Py_UNICODE_COPY(*outptr, repptr, repsize);
2146 *outptr += repsize;
2147 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002148
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002149 /* we made it! */
2150 res = 0;
2151
Benjamin Peterson29060642009-01-31 22:14:21 +00002152 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002153 Py_XDECREF(restuple);
2154 return res;
2155}
2156
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Each macro may evaluate its
   argument more than once, so arguments must be free of side effects. */

/* IS_BASE64: true if c is one of the 64 base-64 characters
   (A-Z, a-z, 0-9, '+' or '/'). */

#define IS_BASE64(c)                            \
    (((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= '0' && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of a base-64 character.  The result
   is only meaningful when IS_BASE64(c) holds; any character that is not a
   letter, digit or '+' maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every byte up to 127 decodes
 * to itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
2191
/* Classification of the 128 ASCII codes for the UTF-7 encoder, which
 * treats them differently according to whether they are Set D, Set O,
 * whitespace, or special (i.e. none of the above).  See RFC2152.
 * The values stored in the table are:
 *     0 : "Set D"
 *         alphanumeric and '(),-./:?
 *     1 : "Set O"
 *         !"#$%&*;<=>@[]^_`{|}
 *     2 : "whitespace"
 *         ht nl cr sp
 *     3 : special (must be base64 encoded)
 *         everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  Set D
 * characters always are; Set O characters only when directO is true and
 * whitespace only when directWS is true.  RFC2152 makes it clear that the
 * answers to these questions vary between applications, so this code
 * needs to be flexible.  The range test comes first so that the table is
 * only indexed with values 1..127; c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002237
Alexander Belopolsky40018472011-02-26 01:02:56 +00002238PyObject *
2239PyUnicode_DecodeUTF7(const char *s,
2240 Py_ssize_t size,
2241 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002242{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002243 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2244}
2245
Antoine Pitrou244651a2009-05-04 18:56:13 +00002246/* The decoder. The only state we preserve is our read position,
2247 * i.e. how many characters we have consumed. So if we end in the
2248 * middle of a shift sequence we have to back off the read position
2249 * and the output to the beginning of the sequence, otherwise we lose
2250 * all the shift state (seen bits, number of bits seen, high
2251 * surrogate). */
2252
Alexander Belopolsky40018472011-02-26 01:02:56 +00002253PyObject *
2254PyUnicode_DecodeUTF7Stateful(const char *s,
2255 Py_ssize_t size,
2256 const char *errors,
2257 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002258{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002259 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002260 Py_ssize_t startinpos;
2261 Py_ssize_t endinpos;
2262 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002263 const char *e;
2264 PyUnicodeObject *unicode;
2265 Py_UNICODE *p;
2266 const char *errmsg = "";
2267 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002268 Py_UNICODE *shiftOutStart;
2269 unsigned int base64bits = 0;
2270 unsigned long base64buffer = 0;
2271 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 PyObject *errorHandler = NULL;
2273 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002274
2275 unicode = _PyUnicode_New(size);
2276 if (!unicode)
2277 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002278 if (size == 0) {
2279 if (consumed)
2280 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002281 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002282 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002283
2284 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002285 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002286 e = s + size;
2287
2288 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002289 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002290 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002291 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002292
Antoine Pitrou244651a2009-05-04 18:56:13 +00002293 if (inShift) { /* in a base-64 section */
2294 if (IS_BASE64(ch)) { /* consume a base-64 character */
2295 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2296 base64bits += 6;
2297 s++;
2298 if (base64bits >= 16) {
2299 /* we have enough bits for a UTF-16 value */
2300 Py_UNICODE outCh = (Py_UNICODE)
2301 (base64buffer >> (base64bits-16));
2302 base64bits -= 16;
2303 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2304 if (surrogate) {
2305 /* expecting a second surrogate */
2306 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2307#ifdef Py_UNICODE_WIDE
2308 *p++ = (((surrogate & 0x3FF)<<10)
2309 | (outCh & 0x3FF)) + 0x10000;
2310#else
2311 *p++ = surrogate;
2312 *p++ = outCh;
2313#endif
2314 surrogate = 0;
2315 }
2316 else {
2317 surrogate = 0;
2318 errmsg = "second surrogate missing";
2319 goto utf7Error;
2320 }
2321 }
2322 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2323 /* first surrogate */
2324 surrogate = outCh;
2325 }
2326 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2327 errmsg = "unexpected second surrogate";
2328 goto utf7Error;
2329 }
2330 else {
2331 *p++ = outCh;
2332 }
2333 }
2334 }
2335 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002336 inShift = 0;
2337 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002338 if (surrogate) {
2339 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002340 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002341 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002342 if (base64bits > 0) { /* left-over bits */
2343 if (base64bits >= 6) {
2344 /* We've seen at least one base-64 character */
2345 errmsg = "partial character in shift sequence";
2346 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002347 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 else {
2349 /* Some bits remain; they should be zero */
2350 if (base64buffer != 0) {
2351 errmsg = "non-zero padding bits in shift sequence";
2352 goto utf7Error;
2353 }
2354 }
2355 }
2356 if (ch != '-') {
2357 /* '-' is absorbed; other terminating
2358 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002359 *p++ = ch;
2360 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002361 }
2362 }
2363 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002364 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002365 s++; /* consume '+' */
2366 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002367 s++;
2368 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002369 }
2370 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002371 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002372 shiftOutStart = p;
2373 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002374 }
2375 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002376 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002377 *p++ = ch;
2378 s++;
2379 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002380 else {
2381 startinpos = s-starts;
2382 s++;
2383 errmsg = "unexpected special character";
2384 goto utf7Error;
2385 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002386 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002387utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002388 outpos = p-PyUnicode_AS_UNICODE(unicode);
2389 endinpos = s-starts;
2390 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002391 errors, &errorHandler,
2392 "utf7", errmsg,
2393 &starts, &e, &startinpos, &endinpos, &exc, &s,
2394 &unicode, &outpos, &p))
2395 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002396 }
2397
Antoine Pitrou244651a2009-05-04 18:56:13 +00002398 /* end of string */
2399
2400 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2401 /* if we're in an inconsistent state, that's an error */
2402 if (surrogate ||
2403 (base64bits >= 6) ||
2404 (base64bits > 0 && base64buffer != 0)) {
2405 outpos = p-PyUnicode_AS_UNICODE(unicode);
2406 endinpos = size;
2407 if (unicode_decode_call_errorhandler(
2408 errors, &errorHandler,
2409 "utf7", "unterminated shift sequence",
2410 &starts, &e, &startinpos, &endinpos, &exc, &s,
2411 &unicode, &outpos, &p))
2412 goto onError;
2413 if (s < e)
2414 goto restart;
2415 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002416 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002417
2418 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002419 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002420 if (inShift) {
2421 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002422 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002423 }
2424 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002425 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002426 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002427 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002428
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002429 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002430 goto onError;
2431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002432 Py_XDECREF(errorHandler);
2433 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002434 return (PyObject *)unicode;
2435
Benjamin Peterson29060642009-01-31 22:14:21 +00002436 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002437 Py_XDECREF(errorHandler);
2438 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002439 Py_DECREF(unicode);
2440 return NULL;
2441}
2442
2443
Alexander Belopolsky40018472011-02-26 01:02:56 +00002444PyObject *
2445PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2446 Py_ssize_t size,
2447 int base64SetO,
2448 int base64WhiteSpace,
2449 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002450{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002451 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002452 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002453 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002454 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002455 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002456 unsigned int base64bits = 0;
2457 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002458 char * out;
2459 char * start;
2460
2461 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002462 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002463
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002464 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002465 return PyErr_NoMemory();
2466
Antoine Pitrou244651a2009-05-04 18:56:13 +00002467 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002468 if (v == NULL)
2469 return NULL;
2470
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002471 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002472 for (;i < size; ++i) {
2473 Py_UNICODE ch = s[i];
2474
Antoine Pitrou244651a2009-05-04 18:56:13 +00002475 if (inShift) {
2476 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2477 /* shifting out */
2478 if (base64bits) { /* output remaining bits */
2479 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2480 base64buffer = 0;
2481 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002482 }
2483 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002484 /* Characters not in the BASE64 set implicitly unshift the sequence
2485 so no '-' is required, except if the character is itself a '-' */
2486 if (IS_BASE64(ch) || ch == '-') {
2487 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002488 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002489 *out++ = (char) ch;
2490 }
2491 else {
2492 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002493 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002494 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002495 else { /* not in a shift sequence */
2496 if (ch == '+') {
2497 *out++ = '+';
2498 *out++ = '-';
2499 }
2500 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2501 *out++ = (char) ch;
2502 }
2503 else {
2504 *out++ = '+';
2505 inShift = 1;
2506 goto encode_char;
2507 }
2508 }
2509 continue;
2510encode_char:
2511#ifdef Py_UNICODE_WIDE
2512 if (ch >= 0x10000) {
2513 /* code first surrogate */
2514 base64bits += 16;
2515 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2516 while (base64bits >= 6) {
2517 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2518 base64bits -= 6;
2519 }
2520 /* prepare second surrogate */
2521 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2522 }
2523#endif
2524 base64bits += 16;
2525 base64buffer = (base64buffer << 16) | ch;
2526 while (base64bits >= 6) {
2527 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2528 base64bits -= 6;
2529 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002530 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002531 if (base64bits)
2532 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2533 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002534 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002535 if (_PyBytes_Resize(&v, out - start) < 0)
2536 return NULL;
2537 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002538}
2539
Antoine Pitrou244651a2009-05-04 18:56:13 +00002540#undef IS_BASE64
2541#undef FROM_BASE64
2542#undef TO_BASE64
2543#undef DECODE_DIRECT
2544#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002545
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546/* --- UTF-8 Codec -------------------------------------------------------- */
2547
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2569
Alexander Belopolsky40018472011-02-26 01:02:56 +00002570PyObject *
2571PyUnicode_DecodeUTF8(const char *s,
2572 Py_ssize_t size,
2573 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002574{
Walter Dörwald69652032004-09-07 20:24:22 +00002575 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2576}
2577
Antoine Pitrouab868312009-01-10 15:40:25 +00002578/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2579#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2580
2581/* Mask to quickly check whether a C 'long' contains a
2582 non-ASCII, UTF8-encoded char. */
2583#if (SIZEOF_LONG == 8)
2584# define ASCII_CHAR_MASK 0x8080808080808080L
2585#elif (SIZEOF_LONG == 4)
2586# define ASCII_CHAR_MASK 0x80808080L
2587#else
2588# error C 'long' size should be either 4 or 8!
2589#endif
2590
Alexander Belopolsky40018472011-02-26 01:02:56 +00002591PyObject *
2592PyUnicode_DecodeUTF8Stateful(const char *s,
2593 Py_ssize_t size,
2594 const char *errors,
2595 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002596{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002597 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002599 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002600 Py_ssize_t startinpos;
2601 Py_ssize_t endinpos;
2602 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002603 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 PyUnicodeObject *unicode;
2605 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002606 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002607 PyObject *errorHandler = NULL;
2608 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609
2610 /* Note: size will always be longer than the resulting Unicode
2611 character count */
2612 unicode = _PyUnicode_New(size);
2613 if (!unicode)
2614 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002615 if (size == 0) {
2616 if (consumed)
2617 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002619 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620
2621 /* Unpack UTF-8 encoded data */
2622 p = unicode->str;
2623 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002624 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625
2626 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002627 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002628
2629 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002630 /* Fast path for runs of ASCII characters. Given that common UTF-8
2631 input will consist of an overwhelming majority of ASCII
2632 characters, we try to optimize for this case by checking
2633 as many characters as a C 'long' can contain.
2634 First, check if we can do an aligned read, as most CPUs have
2635 a penalty for unaligned reads.
2636 */
2637 if (!((size_t) s & LONG_PTR_MASK)) {
2638 /* Help register allocation */
2639 register const char *_s = s;
2640 register Py_UNICODE *_p = p;
2641 while (_s < aligned_end) {
2642 /* Read a whole long at a time (either 4 or 8 bytes),
2643 and do a fast unrolled copy if it only contains ASCII
2644 characters. */
2645 unsigned long data = *(unsigned long *) _s;
2646 if (data & ASCII_CHAR_MASK)
2647 break;
2648 _p[0] = (unsigned char) _s[0];
2649 _p[1] = (unsigned char) _s[1];
2650 _p[2] = (unsigned char) _s[2];
2651 _p[3] = (unsigned char) _s[3];
2652#if (SIZEOF_LONG == 8)
2653 _p[4] = (unsigned char) _s[4];
2654 _p[5] = (unsigned char) _s[5];
2655 _p[6] = (unsigned char) _s[6];
2656 _p[7] = (unsigned char) _s[7];
2657#endif
2658 _s += SIZEOF_LONG;
2659 _p += SIZEOF_LONG;
2660 }
2661 s = _s;
2662 p = _p;
2663 if (s == e)
2664 break;
2665 ch = (unsigned char)*s;
2666 }
2667 }
2668
2669 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002670 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 s++;
2672 continue;
2673 }
2674
2675 n = utf8_code_length[ch];
2676
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002677 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002678 if (consumed)
2679 break;
2680 else {
2681 errmsg = "unexpected end of data";
2682 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002683 endinpos = startinpos+1;
2684 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2685 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002686 goto utf8Error;
2687 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689
2690 switch (n) {
2691
2692 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002693 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002694 startinpos = s-starts;
2695 endinpos = startinpos+1;
2696 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697
2698 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002699 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002700 startinpos = s-starts;
2701 endinpos = startinpos+1;
2702 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002703
2704 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002705 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002706 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002707 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002708 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002709 goto utf8Error;
2710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002712 assert ((ch > 0x007F) && (ch <= 0x07FF));
2713 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 break;
2715
2716 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002717 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2718 will result in surrogates in range d800-dfff. Surrogates are
2719 not valid UTF-8 so they are rejected.
2720 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2721 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002722 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002723 (s[2] & 0xc0) != 0x80 ||
2724 ((unsigned char)s[0] == 0xE0 &&
2725 (unsigned char)s[1] < 0xA0) ||
2726 ((unsigned char)s[0] == 0xED &&
2727 (unsigned char)s[1] > 0x9F)) {
2728 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002729 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002730 endinpos = startinpos + 1;
2731
2732 /* if s[1] first two bits are 1 and 0, then the invalid
2733 continuation byte is s[2], so increment endinpos by 1,
2734 if not, s[1] is invalid and endinpos doesn't need to
2735 be incremented. */
2736 if ((s[1] & 0xC0) == 0x80)
2737 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 goto utf8Error;
2739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002741 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2742 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002743 break;
2744
2745 case 4:
2746 if ((s[1] & 0xc0) != 0x80 ||
2747 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002748 (s[3] & 0xc0) != 0x80 ||
2749 ((unsigned char)s[0] == 0xF0 &&
2750 (unsigned char)s[1] < 0x90) ||
2751 ((unsigned char)s[0] == 0xF4 &&
2752 (unsigned char)s[1] > 0x8F)) {
2753 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002754 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002755 endinpos = startinpos + 1;
2756 if ((s[1] & 0xC0) == 0x80) {
2757 endinpos++;
2758 if ((s[2] & 0xC0) == 0x80)
2759 endinpos++;
2760 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002761 goto utf8Error;
2762 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002763 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002764 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2765 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2766
Fredrik Lundh8f455852001-06-27 18:59:43 +00002767#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002768 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002769#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002770 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002771
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002772 /* translate from 10000..10FFFF to 0..FFFF */
2773 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002774
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002775 /* high surrogate = top 10 bits added to D800 */
2776 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002777
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002778 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002779 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002780#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782 }
2783 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002784 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002785
Benjamin Peterson29060642009-01-31 22:14:21 +00002786 utf8Error:
2787 outpos = p-PyUnicode_AS_UNICODE(unicode);
2788 if (unicode_decode_call_errorhandler(
2789 errors, &errorHandler,
2790 "utf8", errmsg,
2791 &starts, &e, &startinpos, &endinpos, &exc, &s,
2792 &unicode, &outpos, &p))
2793 goto onError;
2794 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 }
Walter Dörwald69652032004-09-07 20:24:22 +00002796 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798
2799 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002800 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 goto onError;
2802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002803 Py_XDECREF(errorHandler);
2804 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 return (PyObject *)unicode;
2806
Benjamin Peterson29060642009-01-31 22:14:21 +00002807 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002808 Py_XDECREF(errorHandler);
2809 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810 Py_DECREF(unicode);
2811 return NULL;
2812}
2813
Antoine Pitrouab868312009-01-10 15:40:25 +00002814#undef ASCII_CHAR_MASK
2815
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes starting at `s` into a freshly allocated,
   NUL-terminated wchar_t buffer obtained from PyMem_Malloc().  A byte
   that cannot start a valid sequence at its position is emitted as
   0xDC00 + byte and decoding resumes with the following byte.

   Returns NULL if the buffer size would overflow (an exception is set
   through PyErr_NoMemory() in that case) or if the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* Compare lengths instead of computing s + n, which could point
           past the end of the input buffer. */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* the destination is a wchar_t buffer, so cast to wchar_t
               like every other case does */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* `ch` still holds the undecodable lead byte here: it is only
           overwritten after a sequence has been validated */
        *p++ = (wchar_t)(0xDC00 + ch);
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002930
Tim Peters602f7402002-04-27 18:03:26 +00002931/* Allocation strategy: if the string is short, convert into a stack buffer
2932 and allocate exactly as much space needed at the end. Else allocate the
2933 maximum possible needed (4 result bytes per Unicode character), and return
2934 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002935*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002936PyObject *
2937PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002938 Py_ssize_t size,
2939 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940{
Tim Peters602f7402002-04-27 18:03:26 +00002941#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002942
Guido van Rossum98297ee2007-11-06 21:34:58 +00002943 Py_ssize_t i; /* index into s of next input byte */
2944 PyObject *result; /* result string object */
2945 char *p; /* next free byte in output buffer */
2946 Py_ssize_t nallocated; /* number of result bytes allocated */
2947 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002948 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002949 PyObject *errorHandler = NULL;
2950 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002951
Tim Peters602f7402002-04-27 18:03:26 +00002952 assert(s != NULL);
2953 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954
Tim Peters602f7402002-04-27 18:03:26 +00002955 if (size <= MAX_SHORT_UNICHARS) {
2956 /* Write into the stack buffer; nallocated can't overflow.
2957 * At the end, we'll allocate exactly as much heap space as it
2958 * turns out we need.
2959 */
2960 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002961 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002962 p = stackbuf;
2963 }
2964 else {
2965 /* Overallocate on the heap, and give the excess back at the end. */
2966 nallocated = size * 4;
2967 if (nallocated / 4 != size) /* overflow! */
2968 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002969 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002970 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002971 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002972 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002973 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002974
Tim Peters602f7402002-04-27 18:03:26 +00002975 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002976 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002977
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002978 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002979 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002981
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002983 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002984 *p++ = (char)(0xc0 | (ch >> 6));
2985 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002986 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002987#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002988 /* Special case: check for high and low surrogate */
2989 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2990 Py_UCS4 ch2 = s[i];
2991 /* Combine the two surrogates to form a UCS4 value */
2992 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2993 i++;
2994
2995 /* Encode UCS4 Unicode ordinals */
2996 *p++ = (char)(0xf0 | (ch >> 18));
2997 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002998 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2999 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00003000 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00003001#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003002 Py_ssize_t newpos;
3003 PyObject *rep;
3004 Py_ssize_t repsize, k;
3005 rep = unicode_encode_call_errorhandler
3006 (errors, &errorHandler, "utf-8", "surrogates not allowed",
3007 s, size, &exc, i-1, i, &newpos);
3008 if (!rep)
3009 goto error;
3010
3011 if (PyBytes_Check(rep))
3012 repsize = PyBytes_GET_SIZE(rep);
3013 else
3014 repsize = PyUnicode_GET_SIZE(rep);
3015
3016 if (repsize > 4) {
3017 Py_ssize_t offset;
3018
3019 if (result == NULL)
3020 offset = p - stackbuf;
3021 else
3022 offset = p - PyBytes_AS_STRING(result);
3023
3024 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3025 /* integer overflow */
3026 PyErr_NoMemory();
3027 goto error;
3028 }
3029 nallocated += repsize - 4;
3030 if (result != NULL) {
3031 if (_PyBytes_Resize(&result, nallocated) < 0)
3032 goto error;
3033 } else {
3034 result = PyBytes_FromStringAndSize(NULL, nallocated);
3035 if (result == NULL)
3036 goto error;
3037 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3038 }
3039 p = PyBytes_AS_STRING(result) + offset;
3040 }
3041
3042 if (PyBytes_Check(rep)) {
3043 char *prep = PyBytes_AS_STRING(rep);
3044 for(k = repsize; k > 0; k--)
3045 *p++ = *prep++;
3046 } else /* rep is unicode */ {
3047 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3048 Py_UNICODE c;
3049
3050 for(k=0; k<repsize; k++) {
3051 c = prep[k];
3052 if (0x80 <= c) {
3053 raise_encode_exception(&exc, "utf-8", s, size,
3054 i-1, i, "surrogates not allowed");
3055 goto error;
3056 }
3057 *p++ = (char)prep[k];
3058 }
3059 }
3060 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003061#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003062 }
Victor Stinner445a6232010-04-22 20:01:57 +00003063#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003064 } else if (ch < 0x10000) {
3065 *p++ = (char)(0xe0 | (ch >> 12));
3066 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3067 *p++ = (char)(0x80 | (ch & 0x3f));
3068 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003069 /* Encode UCS4 Unicode ordinals */
3070 *p++ = (char)(0xf0 | (ch >> 18));
3071 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3072 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3073 *p++ = (char)(0x80 | (ch & 0x3f));
3074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003076
Guido van Rossum98297ee2007-11-06 21:34:58 +00003077 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003078 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003079 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003080 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003081 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003082 }
3083 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003084 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003085 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003086 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003087 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003088 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003089 Py_XDECREF(errorHandler);
3090 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003091 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003092 error:
3093 Py_XDECREF(errorHandler);
3094 Py_XDECREF(exc);
3095 Py_XDECREF(result);
3096 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003097
Tim Peters602f7402002-04-27 18:03:26 +00003098#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099}
3100
Alexander Belopolsky40018472011-02-26 01:02:56 +00003101PyObject *
3102PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103{
Victor Stinnera5c68c32011-03-02 01:03:14 +00003104 PyObject *utf8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 if (!PyUnicode_Check(unicode)) {
3106 PyErr_BadArgument();
3107 return NULL;
3108 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003109 utf8 = _PyUnicode_AsDefaultEncodedString(unicode);
3110 if (utf8 == NULL)
3111 return NULL;
3112 Py_INCREF(utf8);
3113 return utf8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114}
3115
Walter Dörwald41980ca2007-08-16 21:55:45 +00003116/* --- UTF-32 Codec ------------------------------------------------------- */
3117
3118PyObject *
3119PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003120 Py_ssize_t size,
3121 const char *errors,
3122 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003123{
3124 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3125}
3126
3127PyObject *
3128PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003129 Py_ssize_t size,
3130 const char *errors,
3131 int *byteorder,
3132 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003133{
3134 const char *starts = s;
3135 Py_ssize_t startinpos;
3136 Py_ssize_t endinpos;
3137 Py_ssize_t outpos;
3138 PyUnicodeObject *unicode;
3139 Py_UNICODE *p;
3140#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003141 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003142 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003143#else
3144 const int pairs = 0;
3145#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003146 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003147 int bo = 0; /* assume native ordering by default */
3148 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003149 /* Offsets from q for retrieving bytes in the right order. */
3150#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3151 int iorder[] = {0, 1, 2, 3};
3152#else
3153 int iorder[] = {3, 2, 1, 0};
3154#endif
3155 PyObject *errorHandler = NULL;
3156 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003157
Walter Dörwald41980ca2007-08-16 21:55:45 +00003158 q = (unsigned char *)s;
3159 e = q + size;
3160
3161 if (byteorder)
3162 bo = *byteorder;
3163
3164 /* Check for BOM marks (U+FEFF) in the input and adjust current
3165 byte order setting accordingly. In native mode, the leading BOM
3166 mark is skipped, in all other modes, it is copied to the output
3167 stream as-is (giving a ZWNBSP character). */
3168 if (bo == 0) {
3169 if (size >= 4) {
3170 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003172#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003173 if (bom == 0x0000FEFF) {
3174 q += 4;
3175 bo = -1;
3176 }
3177 else if (bom == 0xFFFE0000) {
3178 q += 4;
3179 bo = 1;
3180 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003181#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003182 if (bom == 0x0000FEFF) {
3183 q += 4;
3184 bo = 1;
3185 }
3186 else if (bom == 0xFFFE0000) {
3187 q += 4;
3188 bo = -1;
3189 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003190#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003192 }
3193
3194 if (bo == -1) {
3195 /* force LE */
3196 iorder[0] = 0;
3197 iorder[1] = 1;
3198 iorder[2] = 2;
3199 iorder[3] = 3;
3200 }
3201 else if (bo == 1) {
3202 /* force BE */
3203 iorder[0] = 3;
3204 iorder[1] = 2;
3205 iorder[2] = 1;
3206 iorder[3] = 0;
3207 }
3208
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003209 /* On narrow builds we split characters outside the BMP into two
3210 codepoints => count how much extra space we need. */
3211#ifndef Py_UNICODE_WIDE
3212 for (qq = q; qq < e; qq += 4)
3213 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3214 pairs++;
3215#endif
3216
3217 /* This might be one to much, because of a BOM */
3218 unicode = _PyUnicode_New((size+3)/4+pairs);
3219 if (!unicode)
3220 return NULL;
3221 if (size == 0)
3222 return (PyObject *)unicode;
3223
3224 /* Unpack UTF-32 encoded data */
3225 p = unicode->str;
3226
Walter Dörwald41980ca2007-08-16 21:55:45 +00003227 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 Py_UCS4 ch;
3229 /* remaining bytes at the end? (size should be divisible by 4) */
3230 if (e-q<4) {
3231 if (consumed)
3232 break;
3233 errmsg = "truncated data";
3234 startinpos = ((const char *)q)-starts;
3235 endinpos = ((const char *)e)-starts;
3236 goto utf32Error;
3237 /* The remaining input chars are ignored if the callback
3238 chooses to skip the input */
3239 }
3240 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3241 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003242
Benjamin Peterson29060642009-01-31 22:14:21 +00003243 if (ch >= 0x110000)
3244 {
3245 errmsg = "codepoint not in range(0x110000)";
3246 startinpos = ((const char *)q)-starts;
3247 endinpos = startinpos+4;
3248 goto utf32Error;
3249 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003250#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003251 if (ch >= 0x10000)
3252 {
3253 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3254 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3255 }
3256 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003257#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003258 *p++ = ch;
3259 q += 4;
3260 continue;
3261 utf32Error:
3262 outpos = p-PyUnicode_AS_UNICODE(unicode);
3263 if (unicode_decode_call_errorhandler(
3264 errors, &errorHandler,
3265 "utf32", errmsg,
3266 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3267 &unicode, &outpos, &p))
3268 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003269 }
3270
3271 if (byteorder)
3272 *byteorder = bo;
3273
3274 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003275 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003276
3277 /* Adjust length */
3278 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3279 goto onError;
3280
3281 Py_XDECREF(errorHandler);
3282 Py_XDECREF(exc);
3283 return (PyObject *)unicode;
3284
Benjamin Peterson29060642009-01-31 22:14:21 +00003285 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003286 Py_DECREF(unicode);
3287 Py_XDECREF(errorHandler);
3288 Py_XDECREF(exc);
3289 return NULL;
3290}
3291
3292PyObject *
3293PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003294 Py_ssize_t size,
3295 const char *errors,
3296 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003297{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003298 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003299 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003300 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003301#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003302 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003303#else
3304 const int pairs = 0;
3305#endif
3306 /* Offsets from p for storing byte pairs in the right order. */
3307#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3308 int iorder[] = {0, 1, 2, 3};
3309#else
3310 int iorder[] = {3, 2, 1, 0};
3311#endif
3312
Benjamin Peterson29060642009-01-31 22:14:21 +00003313#define STORECHAR(CH) \
3314 do { \
3315 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3316 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3317 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3318 p[iorder[0]] = (CH) & 0xff; \
3319 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003320 } while(0)
3321
3322 /* In narrow builds we can output surrogate pairs as one codepoint,
3323 so we need less space. */
3324#ifndef Py_UNICODE_WIDE
3325 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003326 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3327 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3328 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003329#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003330 nsize = (size - pairs + (byteorder == 0));
3331 bytesize = nsize * 4;
3332 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003333 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003334 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003335 if (v == NULL)
3336 return NULL;
3337
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003338 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003339 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003340 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003341 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003342 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003343
3344 if (byteorder == -1) {
3345 /* force LE */
3346 iorder[0] = 0;
3347 iorder[1] = 1;
3348 iorder[2] = 2;
3349 iorder[3] = 3;
3350 }
3351 else if (byteorder == 1) {
3352 /* force BE */
3353 iorder[0] = 3;
3354 iorder[1] = 2;
3355 iorder[2] = 1;
3356 iorder[3] = 0;
3357 }
3358
3359 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003360 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003361#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003362 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3363 Py_UCS4 ch2 = *s;
3364 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3365 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3366 s++;
3367 size--;
3368 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003369 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003370#endif
3371 STORECHAR(ch);
3372 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003373
3374 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003375 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003376#undef STORECHAR
3377}
3378
Alexander Belopolsky40018472011-02-26 01:02:56 +00003379PyObject *
3380PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003381{
3382 if (!PyUnicode_Check(unicode)) {
3383 PyErr_BadArgument();
3384 return NULL;
3385 }
3386 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003387 PyUnicode_GET_SIZE(unicode),
3388 NULL,
3389 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003390}
3391
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392/* --- UTF-16 Codec ------------------------------------------------------- */
3393
Tim Peters772747b2001-08-09 22:21:55 +00003394PyObject *
3395PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003396 Py_ssize_t size,
3397 const char *errors,
3398 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399{
Walter Dörwald69652032004-09-07 20:24:22 +00003400 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3401}
3402
Antoine Pitrouab868312009-01-10 15:40:25 +00003403/* Two masks for fast checking of whether a C 'long' may contain
3404 UTF16-encoded surrogate characters. This is an efficient heuristic,
3405 assuming that non-surrogate characters with a code point >= 0x8000 are
3406 rare in most input.
3407 FAST_CHAR_MASK is used when the input is in native byte ordering,
3408 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00003409*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003410#if (SIZEOF_LONG == 8)
3411# define FAST_CHAR_MASK 0x8000800080008000L
3412# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3413#elif (SIZEOF_LONG == 4)
3414# define FAST_CHAR_MASK 0x80008000L
3415# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3416#else
3417# error C 'long' size should be either 4 or 8!
3418#endif
3419
Walter Dörwald69652032004-09-07 20:24:22 +00003420PyObject *
3421PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003422 Py_ssize_t size,
3423 const char *errors,
3424 int *byteorder,
3425 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003426{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003428 Py_ssize_t startinpos;
3429 Py_ssize_t endinpos;
3430 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431 PyUnicodeObject *unicode;
3432 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003433 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003434 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003435 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003436 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003437 /* Offsets from q for retrieving byte pairs in the right order. */
3438#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3439 int ihi = 1, ilo = 0;
3440#else
3441 int ihi = 0, ilo = 1;
3442#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003443 PyObject *errorHandler = NULL;
3444 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445
3446 /* Note: size will always be longer than the resulting Unicode
3447 character count */
3448 unicode = _PyUnicode_New(size);
3449 if (!unicode)
3450 return NULL;
3451 if (size == 0)
3452 return (PyObject *)unicode;
3453
3454 /* Unpack UTF-16 encoded data */
3455 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003456 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003457 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003458
3459 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003460 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003462 /* Check for BOM marks (U+FEFF) in the input and adjust current
3463 byte order setting accordingly. In native mode, the leading BOM
3464 mark is skipped, in all other modes, it is copied to the output
3465 stream as-is (giving a ZWNBSP character). */
3466 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003467 if (size >= 2) {
3468 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003469#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 if (bom == 0xFEFF) {
3471 q += 2;
3472 bo = -1;
3473 }
3474 else if (bom == 0xFFFE) {
3475 q += 2;
3476 bo = 1;
3477 }
Tim Petersced69f82003-09-16 20:30:58 +00003478#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 if (bom == 0xFEFF) {
3480 q += 2;
3481 bo = 1;
3482 }
3483 else if (bom == 0xFFFE) {
3484 q += 2;
3485 bo = -1;
3486 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003487#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003488 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490
Tim Peters772747b2001-08-09 22:21:55 +00003491 if (bo == -1) {
3492 /* force LE */
3493 ihi = 1;
3494 ilo = 0;
3495 }
3496 else if (bo == 1) {
3497 /* force BE */
3498 ihi = 0;
3499 ilo = 1;
3500 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003501#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3502 native_ordering = ilo < ihi;
3503#else
3504 native_ordering = ilo > ihi;
3505#endif
Tim Peters772747b2001-08-09 22:21:55 +00003506
Antoine Pitrouab868312009-01-10 15:40:25 +00003507 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003508 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003509 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003510 /* First check for possible aligned read of a C 'long'. Unaligned
3511 reads are more expensive, better to defer to another iteration. */
3512 if (!((size_t) q & LONG_PTR_MASK)) {
3513 /* Fast path for runs of non-surrogate chars. */
3514 register const unsigned char *_q = q;
3515 Py_UNICODE *_p = p;
3516 if (native_ordering) {
3517 /* Native ordering is simple: as long as the input cannot
3518 possibly contain a surrogate char, do an unrolled copy
3519 of several 16-bit code points to the target object.
3520 The non-surrogate check is done on several input bytes
3521 at a time (as many as a C 'long' can contain). */
3522 while (_q < aligned_end) {
3523 unsigned long data = * (unsigned long *) _q;
3524 if (data & FAST_CHAR_MASK)
3525 break;
3526 _p[0] = ((unsigned short *) _q)[0];
3527 _p[1] = ((unsigned short *) _q)[1];
3528#if (SIZEOF_LONG == 8)
3529 _p[2] = ((unsigned short *) _q)[2];
3530 _p[3] = ((unsigned short *) _q)[3];
3531#endif
3532 _q += SIZEOF_LONG;
3533 _p += SIZEOF_LONG / 2;
3534 }
3535 }
3536 else {
3537 /* Byteswapped ordering is similar, but we must decompose
3538 the copy bytewise, and take care of zero'ing out the
3539 upper bytes if the target object is in 32-bit units
3540 (that is, in UCS-4 builds). */
3541 while (_q < aligned_end) {
3542 unsigned long data = * (unsigned long *) _q;
3543 if (data & SWAPPED_FAST_CHAR_MASK)
3544 break;
3545 /* Zero upper bytes in UCS-4 builds */
3546#if (Py_UNICODE_SIZE > 2)
3547 _p[0] = 0;
3548 _p[1] = 0;
3549#if (SIZEOF_LONG == 8)
3550 _p[2] = 0;
3551 _p[3] = 0;
3552#endif
3553#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003554 /* Issue #4916; UCS-4 builds on big endian machines must
3555 fill the two last bytes of each 4-byte unit. */
3556#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3557# define OFF 2
3558#else
3559# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003560#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003561 ((unsigned char *) _p)[OFF + 1] = _q[0];
3562 ((unsigned char *) _p)[OFF + 0] = _q[1];
3563 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3564 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3565#if (SIZEOF_LONG == 8)
3566 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3567 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3568 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3569 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3570#endif
3571#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003572 _q += SIZEOF_LONG;
3573 _p += SIZEOF_LONG / 2;
3574 }
3575 }
3576 p = _p;
3577 q = _q;
3578 if (q >= e)
3579 break;
3580 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003581 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003582
Benjamin Peterson14339b62009-01-31 16:36:08 +00003583 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003584
3585 if (ch < 0xD800 || ch > 0xDFFF) {
3586 *p++ = ch;
3587 continue;
3588 }
3589
3590 /* UTF-16 code pair: */
3591 if (q > e) {
3592 errmsg = "unexpected end of data";
3593 startinpos = (((const char *)q) - 2) - starts;
3594 endinpos = ((const char *)e) + 1 - starts;
3595 goto utf16Error;
3596 }
3597 if (0xD800 <= ch && ch <= 0xDBFF) {
3598 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3599 q += 2;
3600 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003601#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 *p++ = ch;
3603 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003604#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003605 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003606#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003607 continue;
3608 }
3609 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003610 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003611 startinpos = (((const char *)q)-4)-starts;
3612 endinpos = startinpos+2;
3613 goto utf16Error;
3614 }
3615
Benjamin Peterson14339b62009-01-31 16:36:08 +00003616 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 errmsg = "illegal encoding";
3618 startinpos = (((const char *)q)-2)-starts;
3619 endinpos = startinpos+2;
3620 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003621
Benjamin Peterson29060642009-01-31 22:14:21 +00003622 utf16Error:
3623 outpos = p - PyUnicode_AS_UNICODE(unicode);
3624 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003625 errors,
3626 &errorHandler,
3627 "utf16", errmsg,
3628 &starts,
3629 (const char **)&e,
3630 &startinpos,
3631 &endinpos,
3632 &exc,
3633 (const char **)&q,
3634 &unicode,
3635 &outpos,
3636 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003639 /* remaining byte at the end? (size should be even) */
3640 if (e == q) {
3641 if (!consumed) {
3642 errmsg = "truncated data";
3643 startinpos = ((const char *)q) - starts;
3644 endinpos = ((const char *)e) + 1 - starts;
3645 outpos = p - PyUnicode_AS_UNICODE(unicode);
3646 if (unicode_decode_call_errorhandler(
3647 errors,
3648 &errorHandler,
3649 "utf16", errmsg,
3650 &starts,
3651 (const char **)&e,
3652 &startinpos,
3653 &endinpos,
3654 &exc,
3655 (const char **)&q,
3656 &unicode,
3657 &outpos,
3658 &p))
3659 goto onError;
3660 /* The remaining input chars are ignored if the callback
3661 chooses to skip the input */
3662 }
3663 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664
3665 if (byteorder)
3666 *byteorder = bo;
3667
Walter Dörwald69652032004-09-07 20:24:22 +00003668 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003670
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003672 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 goto onError;
3674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 Py_XDECREF(errorHandler);
3676 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 return (PyObject *)unicode;
3678
Benjamin Peterson29060642009-01-31 22:14:21 +00003679 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 Py_XDECREF(errorHandler);
3682 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 return NULL;
3684}
3685
Antoine Pitrouab868312009-01-10 15:40:25 +00003686#undef FAST_CHAR_MASK
3687#undef SWAPPED_FAST_CHAR_MASK
3688
Tim Peters772747b2001-08-09 22:21:55 +00003689PyObject *
3690PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 Py_ssize_t size,
3692 const char *errors,
3693 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003695 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003696 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003697 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003698#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003699 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003700#else
3701 const int pairs = 0;
3702#endif
Tim Peters772747b2001-08-09 22:21:55 +00003703 /* Offsets from p for storing byte pairs in the right order. */
3704#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3705 int ihi = 1, ilo = 0;
3706#else
3707 int ihi = 0, ilo = 1;
3708#endif
3709
Benjamin Peterson29060642009-01-31 22:14:21 +00003710#define STORECHAR(CH) \
3711 do { \
3712 p[ihi] = ((CH) >> 8) & 0xff; \
3713 p[ilo] = (CH) & 0xff; \
3714 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003715 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003717#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003718 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003719 if (s[i] >= 0x10000)
3720 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003721#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003722 /* 2 * (size + pairs + (byteorder == 0)) */
3723 if (size > PY_SSIZE_T_MAX ||
3724 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003725 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003726 nsize = size + pairs + (byteorder == 0);
3727 bytesize = nsize * 2;
3728 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003730 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 if (v == NULL)
3732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003734 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003735 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003736 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003737 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003738 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003739
3740 if (byteorder == -1) {
3741 /* force LE */
3742 ihi = 1;
3743 ilo = 0;
3744 }
3745 else if (byteorder == 1) {
3746 /* force BE */
3747 ihi = 0;
3748 ilo = 1;
3749 }
3750
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003751 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003752 Py_UNICODE ch = *s++;
3753 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003754#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003755 if (ch >= 0x10000) {
3756 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3757 ch = 0xD800 | ((ch-0x10000) >> 10);
3758 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003759#endif
Tim Peters772747b2001-08-09 22:21:55 +00003760 STORECHAR(ch);
3761 if (ch2)
3762 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003763 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003764
3765 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003766 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003767#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003768}
3769
Alexander Belopolsky40018472011-02-26 01:02:56 +00003770PyObject *
3771PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003772{
3773 if (!PyUnicode_Check(unicode)) {
3774 PyErr_BadArgument();
3775 return NULL;
3776 }
3777 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003778 PyUnicode_GET_SIZE(unicode),
3779 NULL,
3780 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781}
3782
3783/* --- Unicode Escape Codec ----------------------------------------------- */
3784
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   that follows -- confirm against the code that assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00003785static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003786
Alexander Belopolsky40018472011-02-26 01:02:56 +00003787PyObject *
3788PyUnicode_DecodeUnicodeEscape(const char *s,
3789 Py_ssize_t size,
3790 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003793 Py_ssize_t startinpos;
3794 Py_ssize_t endinpos;
3795 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003798 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003800 char* message;
3801 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003802 PyObject *errorHandler = NULL;
3803 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003804
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 /* Escaped strings will always be longer than the resulting
3806 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003807 length after conversion to the true value.
3808 (but if the error callback returns a long replacement string
3809 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810 v = _PyUnicode_New(size);
3811 if (v == NULL)
3812 goto onError;
3813 if (size == 0)
3814 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003815
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003816 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003818
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 while (s < end) {
3820 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003821 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823
3824 /* Non-escape characters are interpreted as Unicode ordinals */
3825 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003826 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 continue;
3828 }
3829
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003830 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003831 /* \ - Escapes */
3832 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003833 c = *s++;
3834 if (s > end)
3835 c = '\0'; /* Invalid after \ */
3836 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837
Benjamin Peterson29060642009-01-31 22:14:21 +00003838 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 case '\n': break;
3840 case '\\': *p++ = '\\'; break;
3841 case '\'': *p++ = '\''; break;
3842 case '\"': *p++ = '\"'; break;
3843 case 'b': *p++ = '\b'; break;
3844 case 'f': *p++ = '\014'; break; /* FF */
3845 case 't': *p++ = '\t'; break;
3846 case 'n': *p++ = '\n'; break;
3847 case 'r': *p++ = '\r'; break;
3848 case 'v': *p++ = '\013'; break; /* VT */
3849 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3850
Benjamin Peterson29060642009-01-31 22:14:21 +00003851 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 case '0': case '1': case '2': case '3':
3853 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003854 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003855 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003856 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003857 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003858 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003860 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861 break;
3862
Benjamin Peterson29060642009-01-31 22:14:21 +00003863 /* hex escapes */
3864 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003866 digits = 2;
3867 message = "truncated \\xXX escape";
3868 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869
Benjamin Peterson29060642009-01-31 22:14:21 +00003870 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003872 digits = 4;
3873 message = "truncated \\uXXXX escape";
3874 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875
Benjamin Peterson29060642009-01-31 22:14:21 +00003876 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003877 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003878 digits = 8;
3879 message = "truncated \\UXXXXXXXX escape";
3880 hexescape:
3881 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003882 outpos = p-PyUnicode_AS_UNICODE(v);
3883 if (s+digits>end) {
3884 endinpos = size;
3885 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 errors, &errorHandler,
3887 "unicodeescape", "end of string in escape sequence",
3888 &starts, &end, &startinpos, &endinpos, &exc, &s,
3889 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003890 goto onError;
3891 goto nextByte;
3892 }
3893 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003894 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003895 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 endinpos = (s+i+1)-starts;
3897 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003898 errors, &errorHandler,
3899 "unicodeescape", message,
3900 &starts, &end, &startinpos, &endinpos, &exc, &s,
3901 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003902 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003903 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003904 }
3905 chr = (chr<<4) & ~0xF;
3906 if (c >= '0' && c <= '9')
3907 chr += c - '0';
3908 else if (c >= 'a' && c <= 'f')
3909 chr += 10 + c - 'a';
3910 else
3911 chr += 10 + c - 'A';
3912 }
3913 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003914 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003915 /* _decoding_error will have already written into the
3916 target buffer. */
3917 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003918 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003919 /* when we get here, chr is a 32-bit unicode character */
3920 if (chr <= 0xffff)
3921 /* UCS-2 character */
3922 *p++ = (Py_UNICODE) chr;
3923 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003924 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003925 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003926#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003927 *p++ = chr;
3928#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003929 chr -= 0x10000L;
3930 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003931 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003932#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003933 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934 endinpos = s-starts;
3935 outpos = p-PyUnicode_AS_UNICODE(v);
3936 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003937 errors, &errorHandler,
3938 "unicodeescape", "illegal Unicode character",
3939 &starts, &end, &startinpos, &endinpos, &exc, &s,
3940 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003941 goto onError;
3942 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003943 break;
3944
Benjamin Peterson29060642009-01-31 22:14:21 +00003945 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003946 case 'N':
3947 message = "malformed \\N character escape";
3948 if (ucnhash_CAPI == NULL) {
3949 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003950 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003951 if (ucnhash_CAPI == NULL)
3952 goto ucnhashError;
3953 }
3954 if (*s == '{') {
3955 const char *start = s+1;
3956 /* look for the closing brace */
3957 while (*s != '}' && s < end)
3958 s++;
3959 if (s > start && s < end && *s == '}') {
3960 /* found a name. look it up in the unicode database */
3961 message = "unknown Unicode character name";
3962 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003963 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003964 goto store;
3965 }
3966 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003967 endinpos = s-starts;
3968 outpos = p-PyUnicode_AS_UNICODE(v);
3969 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003970 errors, &errorHandler,
3971 "unicodeescape", message,
3972 &starts, &end, &startinpos, &endinpos, &exc, &s,
3973 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003974 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003975 break;
3976
3977 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003978 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003979 message = "\\ at end of string";
3980 s--;
3981 endinpos = s-starts;
3982 outpos = p-PyUnicode_AS_UNICODE(v);
3983 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003984 errors, &errorHandler,
3985 "unicodeescape", message,
3986 &starts, &end, &startinpos, &endinpos, &exc, &s,
3987 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003988 goto onError;
3989 }
3990 else {
3991 *p++ = '\\';
3992 *p++ = (unsigned char)s[-1];
3993 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003994 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003997 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003999 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00004001 Py_XDECREF(errorHandler);
4002 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00004004
Benjamin Peterson29060642009-01-31 22:14:21 +00004005 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00004006 PyErr_SetString(
4007 PyExc_UnicodeError,
4008 "\\N escapes not supported (can't load unicodedata module)"
4009 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00004010 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011 Py_XDECREF(errorHandler);
4012 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00004013 return NULL;
4014
Benjamin Peterson29060642009-01-31 22:14:21 +00004015 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004017 Py_XDECREF(errorHandler);
4018 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 return NULL;
4020}
4021
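/* Return a Unicode-Escape string version of the Unicode object.

   The original note here said that, if quotes is true, the string is
   enclosed in u"" or u'' quotes as appropriate; the encoder further below
   (PyUnicode_EncodeUnicodeEscape) takes no such argument and emits no
   surrounding quotes.

*/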
4028
Thomas Wouters477c8d52006-05-27 19:21:47 +00004029Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004030 Py_ssize_t size,
4031 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00004032{
4033 /* like wcschr, but doesn't stop at NULL characters */
4034
4035 while (size-- > 0) {
4036 if (*s == ch)
4037 return s;
4038 s++;
4039 }
4040
4041 return NULL;
4042}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004043
Walter Dörwald79e913e2007-05-12 11:08:06 +00004044static const char *hexdigits = "0123456789abcdef";
4045
Alexander Belopolsky40018472011-02-26 01:02:56 +00004046PyObject *
4047PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4048 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004050 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004053#ifdef Py_UNICODE_WIDE
4054 const Py_ssize_t expandsize = 10;
4055#else
4056 const Py_ssize_t expandsize = 6;
4057#endif
4058
Thomas Wouters89f507f2006-12-13 04:49:30 +00004059 /* XXX(nnorwitz): rather than over-allocating, it would be
4060 better to choose a different scheme. Perhaps scan the
4061 first N-chars of the string and allocate based on that size.
4062 */
4063 /* Initial allocation is based on the longest-possible unichr
4064 escape.
4065
4066 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4067 unichr, so in this case it's the longest unichr escape. In
4068 narrow (UTF-16) builds this is five chars per source unichr
4069 since there are two unichrs in the surrogate pair, so in narrow
4070 (UTF-16) builds it's not the longest unichr escape.
4071
4072 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4073 so in the narrow (UTF-16) build case it's the longest unichr
4074 escape.
4075 */
4076
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004077 if (size == 0)
4078 return PyBytes_FromStringAndSize(NULL, 0);
4079
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004080 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004081 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004082
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004083 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004084 2
4085 + expandsize*size
4086 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087 if (repr == NULL)
4088 return NULL;
4089
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004090 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 while (size-- > 0) {
4093 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004094
Walter Dörwald79e913e2007-05-12 11:08:06 +00004095 /* Escape backslashes */
4096 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 *p++ = '\\';
4098 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004099 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004100 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004101
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004102#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004103 /* Map 21-bit characters to '\U00xxxxxx' */
4104 else if (ch >= 0x10000) {
4105 *p++ = '\\';
4106 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004107 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4108 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4109 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4110 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4111 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4112 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4113 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4114 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004115 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004116 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004117#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004118 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4119 else if (ch >= 0xD800 && ch < 0xDC00) {
4120 Py_UNICODE ch2;
4121 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004122
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 ch2 = *s++;
4124 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004125 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4127 *p++ = '\\';
4128 *p++ = 'U';
4129 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4130 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4131 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4132 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4133 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4134 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4135 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4136 *p++ = hexdigits[ucs & 0x0000000F];
4137 continue;
4138 }
4139 /* Fall through: isolated surrogates are copied as-is */
4140 s--;
4141 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004142 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004143#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004144
Guido van Rossumd57fd912000-03-10 22:53:23 +00004145 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004146 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 *p++ = '\\';
4148 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004149 *p++ = hexdigits[(ch >> 12) & 0x000F];
4150 *p++ = hexdigits[(ch >> 8) & 0x000F];
4151 *p++ = hexdigits[(ch >> 4) & 0x000F];
4152 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004154
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004155 /* Map special whitespace to '\t', \n', '\r' */
4156 else if (ch == '\t') {
4157 *p++ = '\\';
4158 *p++ = 't';
4159 }
4160 else if (ch == '\n') {
4161 *p++ = '\\';
4162 *p++ = 'n';
4163 }
4164 else if (ch == '\r') {
4165 *p++ = '\\';
4166 *p++ = 'r';
4167 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004168
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004169 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004170 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004172 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004173 *p++ = hexdigits[(ch >> 4) & 0x000F];
4174 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004175 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004176
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177 /* Copy everything else as-is */
4178 else
4179 *p++ = (char) ch;
4180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004182 assert(p - PyBytes_AS_STRING(repr) > 0);
4183 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4184 return NULL;
4185 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004186}
4187
Alexander Belopolsky40018472011-02-26 01:02:56 +00004188PyObject *
4189PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004191 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 if (!PyUnicode_Check(unicode)) {
4193 PyErr_BadArgument();
4194 return NULL;
4195 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004196 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4197 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004198 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199}
4200
4201/* --- Raw Unicode Escape Codec ------------------------------------------- */
4202
Alexander Belopolsky40018472011-02-26 01:02:56 +00004203PyObject *
4204PyUnicode_DecodeRawUnicodeEscape(const char *s,
4205 Py_ssize_t size,
4206 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004209 Py_ssize_t startinpos;
4210 Py_ssize_t endinpos;
4211 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 const char *end;
4215 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004216 PyObject *errorHandler = NULL;
4217 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004218
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219 /* Escaped strings will always be longer than the resulting
4220 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221 length after conversion to the true value. (But decoding error
4222 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004223 v = _PyUnicode_New(size);
4224 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004226 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004229 end = s + size;
4230 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004231 unsigned char c;
4232 Py_UCS4 x;
4233 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004234 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235
Benjamin Peterson29060642009-01-31 22:14:21 +00004236 /* Non-escape characters are interpreted as Unicode ordinals */
4237 if (*s != '\\') {
4238 *p++ = (unsigned char)*s++;
4239 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004240 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004241 startinpos = s-starts;
4242
4243 /* \u-escapes are only interpreted iff the number of leading
4244 backslashes if odd */
4245 bs = s;
4246 for (;s < end;) {
4247 if (*s != '\\')
4248 break;
4249 *p++ = (unsigned char)*s++;
4250 }
4251 if (((s - bs) & 1) == 0 ||
4252 s >= end ||
4253 (*s != 'u' && *s != 'U')) {
4254 continue;
4255 }
4256 p--;
4257 count = *s=='u' ? 4 : 8;
4258 s++;
4259
4260 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4261 outpos = p-PyUnicode_AS_UNICODE(v);
4262 for (x = 0, i = 0; i < count; ++i, ++s) {
4263 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004264 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004265 endinpos = s-starts;
4266 if (unicode_decode_call_errorhandler(
4267 errors, &errorHandler,
4268 "rawunicodeescape", "truncated \\uXXXX",
4269 &starts, &end, &startinpos, &endinpos, &exc, &s,
4270 &v, &outpos, &p))
4271 goto onError;
4272 goto nextByte;
4273 }
4274 x = (x<<4) & ~0xF;
4275 if (c >= '0' && c <= '9')
4276 x += c - '0';
4277 else if (c >= 'a' && c <= 'f')
4278 x += 10 + c - 'a';
4279 else
4280 x += 10 + c - 'A';
4281 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004282 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 /* UCS-2 character */
4284 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004285 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004286 /* UCS-4 character. Either store directly, or as
4287 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004288#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004290#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 x -= 0x10000L;
4292 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4293 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004294#endif
4295 } else {
4296 endinpos = s-starts;
4297 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004298 if (unicode_decode_call_errorhandler(
4299 errors, &errorHandler,
4300 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004301 &starts, &end, &startinpos, &endinpos, &exc, &s,
4302 &v, &outpos, &p))
4303 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004304 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004305 nextByte:
4306 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004308 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004309 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 Py_XDECREF(errorHandler);
4311 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004313
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 Py_XDECREF(errorHandler);
4317 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318 return NULL;
4319}
4320
Alexander Belopolsky40018472011-02-26 01:02:56 +00004321PyObject *
4322PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4323 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004325 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 char *p;
4327 char *q;
4328
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004329#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004330 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004331#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004332 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004333#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004334
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004335 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004336 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004337
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004338 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339 if (repr == NULL)
4340 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004341 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004342 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004344 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 while (size-- > 0) {
4346 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004347#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 /* Map 32-bit characters to '\Uxxxxxxxx' */
4349 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004350 *p++ = '\\';
4351 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004352 *p++ = hexdigits[(ch >> 28) & 0xf];
4353 *p++ = hexdigits[(ch >> 24) & 0xf];
4354 *p++ = hexdigits[(ch >> 20) & 0xf];
4355 *p++ = hexdigits[(ch >> 16) & 0xf];
4356 *p++ = hexdigits[(ch >> 12) & 0xf];
4357 *p++ = hexdigits[(ch >> 8) & 0xf];
4358 *p++ = hexdigits[(ch >> 4) & 0xf];
4359 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004360 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004361 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004362#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4364 if (ch >= 0xD800 && ch < 0xDC00) {
4365 Py_UNICODE ch2;
4366 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004367
Benjamin Peterson29060642009-01-31 22:14:21 +00004368 ch2 = *s++;
4369 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004370 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004371 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4372 *p++ = '\\';
4373 *p++ = 'U';
4374 *p++ = hexdigits[(ucs >> 28) & 0xf];
4375 *p++ = hexdigits[(ucs >> 24) & 0xf];
4376 *p++ = hexdigits[(ucs >> 20) & 0xf];
4377 *p++ = hexdigits[(ucs >> 16) & 0xf];
4378 *p++ = hexdigits[(ucs >> 12) & 0xf];
4379 *p++ = hexdigits[(ucs >> 8) & 0xf];
4380 *p++ = hexdigits[(ucs >> 4) & 0xf];
4381 *p++ = hexdigits[ucs & 0xf];
4382 continue;
4383 }
4384 /* Fall through: isolated surrogates are copied as-is */
4385 s--;
4386 size++;
4387 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004388#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 /* Map 16-bit characters to '\uxxxx' */
4390 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 *p++ = '\\';
4392 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004393 *p++ = hexdigits[(ch >> 12) & 0xf];
4394 *p++ = hexdigits[(ch >> 8) & 0xf];
4395 *p++ = hexdigits[(ch >> 4) & 0xf];
4396 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 /* Copy everything else as-is */
4399 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 *p++ = (char) ch;
4401 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004402 size = p - q;
4403
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004404 assert(size > 0);
4405 if (_PyBytes_Resize(&repr, size) < 0)
4406 return NULL;
4407 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408}
4409
Alexander Belopolsky40018472011-02-26 01:02:56 +00004410PyObject *
4411PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004413 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004415 PyErr_BadArgument();
4416 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004418 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4419 PyUnicode_GET_SIZE(unicode));
4420
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004421 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422}
4423
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004424/* --- Unicode Internal Codec ------------------------------------------- */
4425
Alexander Belopolsky40018472011-02-26 01:02:56 +00004426PyObject *
4427_PyUnicode_DecodeUnicodeInternal(const char *s,
4428 Py_ssize_t size,
4429 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004430{
4431 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004432 Py_ssize_t startinpos;
4433 Py_ssize_t endinpos;
4434 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004435 PyUnicodeObject *v;
4436 Py_UNICODE *p;
4437 const char *end;
4438 const char *reason;
4439 PyObject *errorHandler = NULL;
4440 PyObject *exc = NULL;
4441
Neal Norwitzd43069c2006-01-08 01:12:10 +00004442#ifdef Py_UNICODE_WIDE
4443 Py_UNICODE unimax = PyUnicode_GetMax();
4444#endif
4445
Thomas Wouters89f507f2006-12-13 04:49:30 +00004446 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004447 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4448 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004449 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004450 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004451 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004452 p = PyUnicode_AS_UNICODE(v);
4453 end = s + size;
4454
4455 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004456 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004457 /* We have to sanity check the raw data, otherwise doom looms for
4458 some malformed UCS-4 data. */
4459 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004460#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004461 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004462#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004463 end-s < Py_UNICODE_SIZE
4464 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004465 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004466 startinpos = s - starts;
4467 if (end-s < Py_UNICODE_SIZE) {
4468 endinpos = end-starts;
4469 reason = "truncated input";
4470 }
4471 else {
4472 endinpos = s - starts + Py_UNICODE_SIZE;
4473 reason = "illegal code point (> 0x10FFFF)";
4474 }
4475 outpos = p - PyUnicode_AS_UNICODE(v);
4476 if (unicode_decode_call_errorhandler(
4477 errors, &errorHandler,
4478 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004479 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004480 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004481 goto onError;
4482 }
4483 }
4484 else {
4485 p++;
4486 s += Py_UNICODE_SIZE;
4487 }
4488 }
4489
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004490 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004491 goto onError;
4492 Py_XDECREF(errorHandler);
4493 Py_XDECREF(exc);
4494 return (PyObject *)v;
4495
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004497 Py_XDECREF(v);
4498 Py_XDECREF(errorHandler);
4499 Py_XDECREF(exc);
4500 return NULL;
4501}
4502
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503/* --- Latin-1 Codec ------------------------------------------------------ */
4504
Alexander Belopolsky40018472011-02-26 01:02:56 +00004505PyObject *
4506PyUnicode_DecodeLatin1(const char *s,
4507 Py_ssize_t size,
4508 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004509{
4510 PyUnicodeObject *v;
4511 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004512 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004513
Guido van Rossumd57fd912000-03-10 22:53:23 +00004514 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004515 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 Py_UNICODE r = *(unsigned char*)s;
4517 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004518 }
4519
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 v = _PyUnicode_New(size);
4521 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004526 e = s + size;
4527 /* Unrolling the copy makes it much faster by reducing the looping
4528 overhead. This is similar to what many memcpy() implementations do. */
4529 unrolled_end = e - 4;
4530 while (s < unrolled_end) {
4531 p[0] = (unsigned char) s[0];
4532 p[1] = (unsigned char) s[1];
4533 p[2] = (unsigned char) s[2];
4534 p[3] = (unsigned char) s[3];
4535 s += 4;
4536 p += 4;
4537 }
4538 while (s < e)
4539 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004541
Benjamin Peterson29060642009-01-31 22:14:21 +00004542 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 Py_XDECREF(v);
4544 return NULL;
4545}
4546
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004547/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004548static void
4549make_encode_exception(PyObject **exceptionObject,
4550 const char *encoding,
4551 const Py_UNICODE *unicode, Py_ssize_t size,
4552 Py_ssize_t startpos, Py_ssize_t endpos,
4553 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004554{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004555 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004556 *exceptionObject = PyUnicodeEncodeError_Create(
4557 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004558 }
4559 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4561 goto onError;
4562 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4563 goto onError;
4564 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4565 goto onError;
4566 return;
4567 onError:
4568 Py_DECREF(*exceptionObject);
4569 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 }
4571}
4572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004573/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004574static void
4575raise_encode_exception(PyObject **exceptionObject,
4576 const char *encoding,
4577 const Py_UNICODE *unicode, Py_ssize_t size,
4578 Py_ssize_t startpos, Py_ssize_t endpos,
4579 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580{
4581 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004582 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004583 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004585}
4586
4587/* error handling callback helper:
4588 build arguments, call the callback and check the arguments,
4589 put the result into newpos and return the replacement string, which
4590 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004591static PyObject *
4592unicode_encode_call_errorhandler(const char *errors,
4593 PyObject **errorHandler,
4594 const char *encoding, const char *reason,
4595 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4596 Py_ssize_t startpos, Py_ssize_t endpos,
4597 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004598{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004599 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004600
4601 PyObject *restuple;
4602 PyObject *resunicode;
4603
4604 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 }
4609
4610 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614
4615 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004620 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004621 Py_DECREF(restuple);
4622 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004623 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004624 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004625 &resunicode, newpos)) {
4626 Py_DECREF(restuple);
4627 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004628 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004629 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4630 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4631 Py_DECREF(restuple);
4632 return NULL;
4633 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004634 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004636 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004637 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4638 Py_DECREF(restuple);
4639 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004640 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 Py_INCREF(resunicode);
4642 Py_DECREF(restuple);
4643 return resunicode;
4644}
4645
Alexander Belopolsky40018472011-02-26 01:02:56 +00004646static PyObject *
4647unicode_encode_ucs1(const Py_UNICODE *p,
4648 Py_ssize_t size,
4649 const char *errors,
4650 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651{
4652 /* output object */
4653 PyObject *res;
4654 /* pointers to the beginning and end+1 of input */
4655 const Py_UNICODE *startp = p;
4656 const Py_UNICODE *endp = p + size;
4657 /* pointer to the beginning of the unencodable characters */
4658 /* const Py_UNICODE *badp = NULL; */
4659 /* pointer into the output */
4660 char *str;
4661 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004662 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004663 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4664 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004665 PyObject *errorHandler = NULL;
4666 PyObject *exc = NULL;
4667 /* the following variable is used for caching string comparisons
4668 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4669 int known_errorHandler = -1;
4670
4671 /* allocate enough for a simple encoding without
4672 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004673 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004674 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004675 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004676 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004677 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004678 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004679 ressize = size;
4680
4681 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004682 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004683
Benjamin Peterson29060642009-01-31 22:14:21 +00004684 /* can we encode this? */
4685 if (c<limit) {
4686 /* no overflow check, because we know that the space is enough */
4687 *str++ = (char)c;
4688 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004689 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004690 else {
4691 Py_ssize_t unicodepos = p-startp;
4692 Py_ssize_t requiredsize;
4693 PyObject *repunicode;
4694 Py_ssize_t repsize;
4695 Py_ssize_t newpos;
4696 Py_ssize_t respos;
4697 Py_UNICODE *uni2;
4698 /* startpos for collecting unencodable chars */
4699 const Py_UNICODE *collstart = p;
4700 const Py_UNICODE *collend = p;
4701 /* find all unecodable characters */
4702 while ((collend < endp) && ((*collend)>=limit))
4703 ++collend;
4704 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4705 if (known_errorHandler==-1) {
4706 if ((errors==NULL) || (!strcmp(errors, "strict")))
4707 known_errorHandler = 1;
4708 else if (!strcmp(errors, "replace"))
4709 known_errorHandler = 2;
4710 else if (!strcmp(errors, "ignore"))
4711 known_errorHandler = 3;
4712 else if (!strcmp(errors, "xmlcharrefreplace"))
4713 known_errorHandler = 4;
4714 else
4715 known_errorHandler = 0;
4716 }
4717 switch (known_errorHandler) {
4718 case 1: /* strict */
4719 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4720 goto onError;
4721 case 2: /* replace */
4722 while (collstart++<collend)
4723 *str++ = '?'; /* fall through */
4724 case 3: /* ignore */
4725 p = collend;
4726 break;
4727 case 4: /* xmlcharrefreplace */
4728 respos = str - PyBytes_AS_STRING(res);
4729 /* determine replacement size (temporarily (mis)uses p) */
4730 for (p = collstart, repsize = 0; p < collend; ++p) {
4731 if (*p<10)
4732 repsize += 2+1+1;
4733 else if (*p<100)
4734 repsize += 2+2+1;
4735 else if (*p<1000)
4736 repsize += 2+3+1;
4737 else if (*p<10000)
4738 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004739#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004740 else
4741 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004742#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004743 else if (*p<100000)
4744 repsize += 2+5+1;
4745 else if (*p<1000000)
4746 repsize += 2+6+1;
4747 else
4748 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004749#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 }
4751 requiredsize = respos+repsize+(endp-collend);
4752 if (requiredsize > ressize) {
4753 if (requiredsize<2*ressize)
4754 requiredsize = 2*ressize;
4755 if (_PyBytes_Resize(&res, requiredsize))
4756 goto onError;
4757 str = PyBytes_AS_STRING(res) + respos;
4758 ressize = requiredsize;
4759 }
4760 /* generate replacement (temporarily (mis)uses p) */
4761 for (p = collstart; p < collend; ++p) {
4762 str += sprintf(str, "&#%d;", (int)*p);
4763 }
4764 p = collend;
4765 break;
4766 default:
4767 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4768 encoding, reason, startp, size, &exc,
4769 collstart-startp, collend-startp, &newpos);
4770 if (repunicode == NULL)
4771 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004772 if (PyBytes_Check(repunicode)) {
4773 /* Directly copy bytes result to output. */
4774 repsize = PyBytes_Size(repunicode);
4775 if (repsize > 1) {
4776 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004777 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004778 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4779 Py_DECREF(repunicode);
4780 goto onError;
4781 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004782 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004783 ressize += repsize-1;
4784 }
4785 memcpy(str, PyBytes_AsString(repunicode), repsize);
4786 str += repsize;
4787 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004788 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004789 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004790 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004791 /* need more space? (at least enough for what we
4792 have+the replacement+the rest of the string, so
4793 we won't have to check space for encodable characters) */
4794 respos = str - PyBytes_AS_STRING(res);
4795 repsize = PyUnicode_GET_SIZE(repunicode);
4796 requiredsize = respos+repsize+(endp-collend);
4797 if (requiredsize > ressize) {
4798 if (requiredsize<2*ressize)
4799 requiredsize = 2*ressize;
4800 if (_PyBytes_Resize(&res, requiredsize)) {
4801 Py_DECREF(repunicode);
4802 goto onError;
4803 }
4804 str = PyBytes_AS_STRING(res) + respos;
4805 ressize = requiredsize;
4806 }
4807 /* check if there is anything unencodable in the replacement
4808 and copy it to the output */
4809 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4810 c = *uni2;
4811 if (c >= limit) {
4812 raise_encode_exception(&exc, encoding, startp, size,
4813 unicodepos, unicodepos+1, reason);
4814 Py_DECREF(repunicode);
4815 goto onError;
4816 }
4817 *str = (char)c;
4818 }
4819 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004820 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004821 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004822 }
4823 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004824 /* Resize if we allocated to much */
4825 size = str - PyBytes_AS_STRING(res);
4826 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004827 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004828 if (_PyBytes_Resize(&res, size) < 0)
4829 goto onError;
4830 }
4831
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004832 Py_XDECREF(errorHandler);
4833 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004834 return res;
4835
4836 onError:
4837 Py_XDECREF(res);
4838 Py_XDECREF(errorHandler);
4839 Py_XDECREF(exc);
4840 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004841}
4842
Alexander Belopolsky40018472011-02-26 01:02:56 +00004843PyObject *
4844PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4845 Py_ssize_t size,
4846 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849}
4850
Alexander Belopolsky40018472011-02-26 01:02:56 +00004851PyObject *
4852PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853{
4854 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 PyErr_BadArgument();
4856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 }
4858 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004859 PyUnicode_GET_SIZE(unicode),
4860 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861}
4862
4863/* --- 7-bit ASCII Codec -------------------------------------------------- */
4864
Alexander Belopolsky40018472011-02-26 01:02:56 +00004865PyObject *
4866PyUnicode_DecodeASCII(const char *s,
4867 Py_ssize_t size,
4868 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004870 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 PyUnicodeObject *v;
4872 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004873 Py_ssize_t startinpos;
4874 Py_ssize_t endinpos;
4875 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004876 const char *e;
4877 PyObject *errorHandler = NULL;
4878 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004879
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004881 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 Py_UNICODE r = *(unsigned char*)s;
4883 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004884 }
Tim Petersced69f82003-09-16 20:30:58 +00004885
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 v = _PyUnicode_New(size);
4887 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004890 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004892 e = s + size;
4893 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004894 register unsigned char c = (unsigned char)*s;
4895 if (c < 128) {
4896 *p++ = c;
4897 ++s;
4898 }
4899 else {
4900 startinpos = s-starts;
4901 endinpos = startinpos + 1;
4902 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4903 if (unicode_decode_call_errorhandler(
4904 errors, &errorHandler,
4905 "ascii", "ordinal not in range(128)",
4906 &starts, &e, &startinpos, &endinpos, &exc, &s,
4907 &v, &outpos, &p))
4908 goto onError;
4909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004911 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4913 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004914 Py_XDECREF(errorHandler);
4915 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004917
Benjamin Peterson29060642009-01-31 22:14:21 +00004918 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004920 Py_XDECREF(errorHandler);
4921 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 return NULL;
4923}
4924
Alexander Belopolsky40018472011-02-26 01:02:56 +00004925PyObject *
4926PyUnicode_EncodeASCII(const Py_UNICODE *p,
4927 Py_ssize_t size,
4928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004930 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931}
4932
Alexander Belopolsky40018472011-02-26 01:02:56 +00004933PyObject *
4934PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935{
4936 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 PyErr_BadArgument();
4938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 }
4940 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 PyUnicode_GET_SIZE(unicode),
4942 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943}
4944
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004945#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004946
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004947/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004948
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004949#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004950#define NEED_RETRY
4951#endif
4952
4953/* XXX This code is limited to "true" double-byte encodings, as
4954 a) it assumes an incomplete character consists of a single byte, and
4955 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004957
/* Return 1 if the byte at s[offset] is classified as a DBCS lead byte,
   0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts when the
   character CharPrev() finds before it is the same position, does not
   itself start with a lead byte, or is exactly two bytes away. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4969
4970/*
4971 * Decode MBCS string into unicode object. If 'final' is set, converts
4972 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4973 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004974static int
4975decode_mbcs(PyUnicodeObject **v,
4976 const char *s, /* MBCS string */
4977 int size, /* sizeof MBCS string */
4978 int final,
4979 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004980{
4981 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004982 Py_ssize_t n;
4983 DWORD usize;
4984 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004985
4986 assert(size >= 0);
4987
Victor Stinner554f3f02010-06-16 23:33:54 +00004988 /* check and handle 'errors' arg */
4989 if (errors==NULL || strcmp(errors, "strict")==0)
4990 flags = MB_ERR_INVALID_CHARS;
4991 else if (strcmp(errors, "ignore")==0)
4992 flags = 0;
4993 else {
4994 PyErr_Format(PyExc_ValueError,
4995 "mbcs encoding does not support errors='%s'",
4996 errors);
4997 return -1;
4998 }
4999
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005000 /* Skip trailing lead-byte unless 'final' is set */
5001 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00005002 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005003
5004 /* First get the size of the result */
5005 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005006 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
5007 if (usize==0)
5008 goto mbcs_decode_error;
5009 } else
5010 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005011
5012 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 /* Create unicode object */
5014 *v = _PyUnicode_New(usize);
5015 if (*v == NULL)
5016 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005017 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005018 }
5019 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 /* Extend unicode object */
5021 n = PyUnicode_GET_SIZE(*v);
5022 if (_PyUnicode_Resize(v, n + usize) < 0)
5023 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005024 }
5025
5026 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00005027 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005029 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5030 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005032 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005033 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00005034
5035mbcs_decode_error:
5036 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5037 we raise a UnicodeDecodeError - else it is a 'generic'
5038 windows error
5039 */
5040 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5041 /* Ideally, we should get reason from FormatMessage - this
5042 is the Windows 2000 English version of the message
5043 */
5044 PyObject *exc = NULL;
5045 const char *reason = "No mapping for the Unicode character exists "
5046 "in the target multi-byte code page.";
5047 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5048 if (exc != NULL) {
5049 PyCodec_StrictErrors(exc);
5050 Py_DECREF(exc);
5051 }
5052 } else {
5053 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5054 }
5055 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005056}
5057
Alexander Belopolsky40018472011-02-26 01:02:56 +00005058PyObject *
5059PyUnicode_DecodeMBCSStateful(const char *s,
5060 Py_ssize_t size,
5061 const char *errors,
5062 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005063{
5064 PyUnicodeObject *v = NULL;
5065 int done;
5066
5067 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005069
5070#ifdef NEED_RETRY
5071 retry:
5072 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005073 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005074 else
5075#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005076 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005077
5078 if (done < 0) {
5079 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005081 }
5082
5083 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005085
5086#ifdef NEED_RETRY
5087 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 s += done;
5089 size -= done;
5090 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005091 }
5092#endif
5093
5094 return (PyObject *)v;
5095}
5096
Alexander Belopolsky40018472011-02-26 01:02:56 +00005097PyObject *
5098PyUnicode_DecodeMBCS(const char *s,
5099 Py_ssize_t size,
5100 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005101{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005102 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5103}
5104
5105/*
5106 * Convert unicode into string object (MBCS).
5107 * Returns 0 if succeed, -1 otherwise.
5108 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005109static int
5110encode_mbcs(PyObject **repr,
5111 const Py_UNICODE *p, /* unicode */
5112 int size, /* size of unicode */
5113 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005114{
Victor Stinner554f3f02010-06-16 23:33:54 +00005115 BOOL usedDefaultChar = FALSE;
5116 BOOL *pusedDefaultChar;
5117 int mbcssize;
5118 Py_ssize_t n;
5119 PyObject *exc = NULL;
5120 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005121
5122 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005123
Victor Stinner554f3f02010-06-16 23:33:54 +00005124 /* check and handle 'errors' arg */
5125 if (errors==NULL || strcmp(errors, "strict")==0) {
5126 flags = WC_NO_BEST_FIT_CHARS;
5127 pusedDefaultChar = &usedDefaultChar;
5128 } else if (strcmp(errors, "replace")==0) {
5129 flags = 0;
5130 pusedDefaultChar = NULL;
5131 } else {
5132 PyErr_Format(PyExc_ValueError,
5133 "mbcs encoding does not support errors='%s'",
5134 errors);
5135 return -1;
5136 }
5137
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005138 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005139 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005140 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5141 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005142 if (mbcssize == 0) {
5143 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5144 return -1;
5145 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005146 /* If we used a default char, then we failed! */
5147 if (pusedDefaultChar && *pusedDefaultChar)
5148 goto mbcs_encode_error;
5149 } else {
5150 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005151 }
5152
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005153 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 /* Create string object */
5155 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5156 if (*repr == NULL)
5157 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005158 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005159 }
5160 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 /* Extend string object */
5162 n = PyBytes_Size(*repr);
5163 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5164 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005165 }
5166
5167 /* Do the conversion */
5168 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005169 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005170 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5171 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5173 return -1;
5174 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005175 if (pusedDefaultChar && *pusedDefaultChar)
5176 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005177 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005178 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005179
5180mbcs_encode_error:
5181 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5182 Py_XDECREF(exc);
5183 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005184}
5185
Alexander Belopolsky40018472011-02-26 01:02:56 +00005186PyObject *
5187PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5188 Py_ssize_t size,
5189 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005190{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005191 PyObject *repr = NULL;
5192 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005193
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005194#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005195 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005196 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005197 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005198 else
5199#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005200 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005201
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005202 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 Py_XDECREF(repr);
5204 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005205 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005206
5207#ifdef NEED_RETRY
5208 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005209 p += INT_MAX;
5210 size -= INT_MAX;
5211 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005212 }
5213#endif
5214
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005215 return repr;
5216}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005217
Alexander Belopolsky40018472011-02-26 01:02:56 +00005218PyObject *
5219PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005220{
5221 if (!PyUnicode_Check(unicode)) {
5222 PyErr_BadArgument();
5223 return NULL;
5224 }
5225 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 PyUnicode_GET_SIZE(unicode),
5227 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005228}
5229
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005230#undef NEED_RETRY
5231
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005232#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005233
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234/* --- Character Mapping Codec -------------------------------------------- */
5235
Alexander Belopolsky40018472011-02-26 01:02:56 +00005236PyObject *
5237PyUnicode_DecodeCharmap(const char *s,
5238 Py_ssize_t size,
5239 PyObject *mapping,
5240 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005243 Py_ssize_t startinpos;
5244 Py_ssize_t endinpos;
5245 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 PyUnicodeObject *v;
5248 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005249 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 PyObject *errorHandler = NULL;
5251 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005252 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005253 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005254
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 /* Default to Latin-1 */
5256 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258
5259 v = _PyUnicode_New(size);
5260 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005262 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005265 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005266 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 mapstring = PyUnicode_AS_UNICODE(mapping);
5268 maplen = PyUnicode_GET_SIZE(mapping);
5269 while (s < e) {
5270 unsigned char ch = *s;
5271 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
Benjamin Peterson29060642009-01-31 22:14:21 +00005273 if (ch < maplen)
5274 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275
Benjamin Peterson29060642009-01-31 22:14:21 +00005276 if (x == 0xfffe) {
5277 /* undefined mapping */
5278 outpos = p-PyUnicode_AS_UNICODE(v);
5279 startinpos = s-starts;
5280 endinpos = startinpos+1;
5281 if (unicode_decode_call_errorhandler(
5282 errors, &errorHandler,
5283 "charmap", "character maps to <undefined>",
5284 &starts, &e, &startinpos, &endinpos, &exc, &s,
5285 &v, &outpos, &p)) {
5286 goto onError;
5287 }
5288 continue;
5289 }
5290 *p++ = x;
5291 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005292 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005293 }
5294 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 while (s < e) {
5296 unsigned char ch = *s;
5297 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005298
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5300 w = PyLong_FromLong((long)ch);
5301 if (w == NULL)
5302 goto onError;
5303 x = PyObject_GetItem(mapping, w);
5304 Py_DECREF(w);
5305 if (x == NULL) {
5306 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5307 /* No mapping found means: mapping is undefined. */
5308 PyErr_Clear();
5309 x = Py_None;
5310 Py_INCREF(x);
5311 } else
5312 goto onError;
5313 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005314
Benjamin Peterson29060642009-01-31 22:14:21 +00005315 /* Apply mapping */
5316 if (PyLong_Check(x)) {
5317 long value = PyLong_AS_LONG(x);
5318 if (value < 0 || value > 65535) {
5319 PyErr_SetString(PyExc_TypeError,
5320 "character mapping must be in range(65536)");
5321 Py_DECREF(x);
5322 goto onError;
5323 }
5324 *p++ = (Py_UNICODE)value;
5325 }
5326 else if (x == Py_None) {
5327 /* undefined mapping */
5328 outpos = p-PyUnicode_AS_UNICODE(v);
5329 startinpos = s-starts;
5330 endinpos = startinpos+1;
5331 if (unicode_decode_call_errorhandler(
5332 errors, &errorHandler,
5333 "charmap", "character maps to <undefined>",
5334 &starts, &e, &startinpos, &endinpos, &exc, &s,
5335 &v, &outpos, &p)) {
5336 Py_DECREF(x);
5337 goto onError;
5338 }
5339 Py_DECREF(x);
5340 continue;
5341 }
5342 else if (PyUnicode_Check(x)) {
5343 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005344
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 if (targetsize == 1)
5346 /* 1-1 mapping */
5347 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005348
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 else if (targetsize > 1) {
5350 /* 1-n mapping */
5351 if (targetsize > extrachars) {
5352 /* resize first */
5353 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5354 Py_ssize_t needed = (targetsize - extrachars) + \
5355 (targetsize << 2);
5356 extrachars += needed;
5357 /* XXX overflow detection missing */
5358 if (_PyUnicode_Resize(&v,
5359 PyUnicode_GET_SIZE(v) + needed) < 0) {
5360 Py_DECREF(x);
5361 goto onError;
5362 }
5363 p = PyUnicode_AS_UNICODE(v) + oldpos;
5364 }
5365 Py_UNICODE_COPY(p,
5366 PyUnicode_AS_UNICODE(x),
5367 targetsize);
5368 p += targetsize;
5369 extrachars -= targetsize;
5370 }
5371 /* 1-0 mapping: skip the character */
5372 }
5373 else {
5374 /* wrong return value */
5375 PyErr_SetString(PyExc_TypeError,
5376 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005377 Py_DECREF(x);
5378 goto onError;
5379 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 Py_DECREF(x);
5381 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 }
5384 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5386 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005387 Py_XDECREF(errorHandler);
5388 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005390
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005392 Py_XDECREF(errorHandler);
5393 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 Py_XDECREF(v);
5395 return NULL;
5396}
5397
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005398/* Charmap encoding: the lookup table */
5399
/* Object holding the three-level lookup table used for charmap encoding.
   It is filled in by PyUnicode_BuildEncodingMap() and read by
   encoding_map_lookup(). The object is allocated with room for
   16*count2 + 128*count3 bytes in level23. */
struct encoding_map {
    PyObject_HEAD
    /* indexed by (code point >> 11); 0xFF marks an unused entry */
    unsigned char level1[32];
    /* number of 16-byte level-2 blocks and 128-byte level-3 blocks */
    int count2, count3;
    /* 16*count2 level-2 bytes followed by 128*count3 level-3 bytes */
    unsigned char level23[1];
};
5406
5407static PyObject*
5408encoding_map_size(PyObject *obj, PyObject* args)
5409{
5410 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005411 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005413}
5414
/* Method table referenced from EncodingMapType's tp_methods slot; the
   all-zero entry ends the table. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
5420
5421static void
5422encoding_map_dealloc(PyObject* o)
5423{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005424 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005425}
5426
/* Type object for struct encoding_map. Only tp_name, tp_basicsize,
   tp_dealloc, tp_flags and tp_methods are filled in; tp_new is left 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
5470
5471PyObject*
5472PyUnicode_BuildEncodingMap(PyObject* string)
5473{
5474 Py_UNICODE *decode;
5475 PyObject *result;
5476 struct encoding_map *mresult;
5477 int i;
5478 int need_dict = 0;
5479 unsigned char level1[32];
5480 unsigned char level2[512];
5481 unsigned char *mlevel1, *mlevel2, *mlevel3;
5482 int count2 = 0, count3 = 0;
5483
5484 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5485 PyErr_BadArgument();
5486 return NULL;
5487 }
5488 decode = PyUnicode_AS_UNICODE(string);
5489 memset(level1, 0xFF, sizeof level1);
5490 memset(level2, 0xFF, sizeof level2);
5491
5492 /* If there isn't a one-to-one mapping of NULL to \0,
5493 or if there are non-BMP characters, we need to use
5494 a mapping dictionary. */
5495 if (decode[0] != 0)
5496 need_dict = 1;
5497 for (i = 1; i < 256; i++) {
5498 int l1, l2;
5499 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005500#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005501 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005502#endif
5503 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005504 need_dict = 1;
5505 break;
5506 }
5507 if (decode[i] == 0xFFFE)
5508 /* unmapped character */
5509 continue;
5510 l1 = decode[i] >> 11;
5511 l2 = decode[i] >> 7;
5512 if (level1[l1] == 0xFF)
5513 level1[l1] = count2++;
5514 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005515 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005516 }
5517
5518 if (count2 >= 0xFF || count3 >= 0xFF)
5519 need_dict = 1;
5520
5521 if (need_dict) {
5522 PyObject *result = PyDict_New();
5523 PyObject *key, *value;
5524 if (!result)
5525 return NULL;
5526 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005527 key = PyLong_FromLong(decode[i]);
5528 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005529 if (!key || !value)
5530 goto failed1;
5531 if (PyDict_SetItem(result, key, value) == -1)
5532 goto failed1;
5533 Py_DECREF(key);
5534 Py_DECREF(value);
5535 }
5536 return result;
5537 failed1:
5538 Py_XDECREF(key);
5539 Py_XDECREF(value);
5540 Py_DECREF(result);
5541 return NULL;
5542 }
5543
5544 /* Create a three-level trie */
5545 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5546 16*count2 + 128*count3 - 1);
5547 if (!result)
5548 return PyErr_NoMemory();
5549 PyObject_Init(result, &EncodingMapType);
5550 mresult = (struct encoding_map*)result;
5551 mresult->count2 = count2;
5552 mresult->count3 = count3;
5553 mlevel1 = mresult->level1;
5554 mlevel2 = mresult->level23;
5555 mlevel3 = mresult->level23 + 16*count2;
5556 memcpy(mlevel1, level1, 32);
5557 memset(mlevel2, 0xFF, 16*count2);
5558 memset(mlevel3, 0, 128*count3);
5559 count3 = 0;
5560 for (i = 1; i < 256; i++) {
5561 int o1, o2, o3, i2, i3;
5562 if (decode[i] == 0xFFFE)
5563 /* unmapped character */
5564 continue;
5565 o1 = decode[i]>>11;
5566 o2 = (decode[i]>>7) & 0xF;
5567 i2 = 16*mlevel1[o1] + o2;
5568 if (mlevel2[i2] == 0xFF)
5569 mlevel2[i2] = count3++;
5570 o3 = decode[i] & 0x7F;
5571 i3 = 128*mlevel2[i2] + o3;
5572 mlevel3[i3] = i;
5573 }
5574 return result;
5575}
5576
5577static int
5578encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5579{
5580 struct encoding_map *map = (struct encoding_map*)mapping;
5581 int l1 = c>>11;
5582 int l2 = (c>>7) & 0xF;
5583 int l3 = c & 0x7F;
5584 int i;
5585
5586#ifdef Py_UNICODE_WIDE
5587 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005589 }
5590#endif
5591 if (c == 0)
5592 return 0;
5593 /* level 1*/
5594 i = map->level1[l1];
5595 if (i == 0xFF) {
5596 return -1;
5597 }
5598 /* level 2*/
5599 i = map->level23[16*i+l2];
5600 if (i == 0xFF) {
5601 return -1;
5602 }
5603 /* level 3 */
5604 i = map->level23[16*map->count2 + 128*i + l3];
5605 if (i == 0) {
5606 return -1;
5607 }
5608 return i;
5609}
5610
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005611/* Lookup the character ch in the mapping. If the character
5612 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005613 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005614static PyObject *
5615charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616{
Christian Heimes217cfd12007-12-02 14:31:20 +00005617 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 PyObject *x;
5619
5620 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005621 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005622 x = PyObject_GetItem(mapping, w);
5623 Py_DECREF(w);
5624 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005625 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5626 /* No mapping found means: mapping is undefined. */
5627 PyErr_Clear();
5628 x = Py_None;
5629 Py_INCREF(x);
5630 return x;
5631 } else
5632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005634 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005636 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 long value = PyLong_AS_LONG(x);
5638 if (value < 0 || value > 255) {
5639 PyErr_SetString(PyExc_TypeError,
5640 "character mapping must be in range(256)");
5641 Py_DECREF(x);
5642 return NULL;
5643 }
5644 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005646 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005649 /* wrong return value */
5650 PyErr_Format(PyExc_TypeError,
5651 "character mapping must return integer, bytes or None, not %.400s",
5652 x->ob_type->tp_name);
5653 Py_DECREF(x);
5654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655 }
5656}
5657
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005658static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005659charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005660{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005661 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5662 /* exponentially overallocate to minimize reallocations */
5663 if (requiredsize < 2*outsize)
5664 requiredsize = 2*outsize;
5665 if (_PyBytes_Resize(outobj, requiredsize))
5666 return -1;
5667 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005668}
5669
/* Outcome of charmapencode_output():
     enc_SUCCESS    the character was encoded and written to the output;
     enc_FAILED     the mapping has no entry for the character (nothing
                    was written);
     enc_EXCEPTION  the lookup or the output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005673/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005674 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675 space is available. Return a new reference to the object that
5676 was put in the output buffer, or Py_None, if the mapping was undefined
5677 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005678 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005679static charmapencode_result
5680charmapencode_output(Py_UNICODE c, PyObject *mapping,
5681 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005682{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005683 PyObject *rep;
5684 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005685 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686
Christian Heimes90aa7642007-12-19 02:45:37 +00005687 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005688 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005689 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005690 if (res == -1)
5691 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 if (outsize<requiredsize)
5693 if (charmapencode_resize(outobj, outpos, requiredsize))
5694 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005695 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 outstart[(*outpos)++] = (char)res;
5697 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005698 }
5699
5700 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005703 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 Py_DECREF(rep);
5705 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005706 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 if (PyLong_Check(rep)) {
5708 Py_ssize_t requiredsize = *outpos+1;
5709 if (outsize<requiredsize)
5710 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5711 Py_DECREF(rep);
5712 return enc_EXCEPTION;
5713 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005714 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005716 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 else {
5718 const char *repchars = PyBytes_AS_STRING(rep);
5719 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5720 Py_ssize_t requiredsize = *outpos+repsize;
5721 if (outsize<requiredsize)
5722 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5723 Py_DECREF(rep);
5724 return enc_EXCEPTION;
5725 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005726 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 memcpy(outstart + *outpos, repchars, repsize);
5728 *outpos += repsize;
5729 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005730 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005731 Py_DECREF(rep);
5732 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005733}
5734
5735/* handle an error in PyUnicode_EncodeCharmap
5736 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005737static int
5738charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005739 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005741 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005742 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005743{
5744 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005745 Py_ssize_t repsize;
5746 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005747 Py_UNICODE *uni2;
5748 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005749 Py_ssize_t collstartpos = *inpos;
5750 Py_ssize_t collendpos = *inpos+1;
5751 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 char *encoding = "charmap";
5753 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005754 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005755
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 /* find all unencodable characters */
5757 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005758 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005759 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 int res = encoding_map_lookup(p[collendpos], mapping);
5761 if (res != -1)
5762 break;
5763 ++collendpos;
5764 continue;
5765 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005766
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 rep = charmapencode_lookup(p[collendpos], mapping);
5768 if (rep==NULL)
5769 return -1;
5770 else if (rep!=Py_None) {
5771 Py_DECREF(rep);
5772 break;
5773 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005774 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 }
5777 /* cache callback name lookup
5778 * (if not done yet, i.e. it's the first error) */
5779 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 if ((errors==NULL) || (!strcmp(errors, "strict")))
5781 *known_errorHandler = 1;
5782 else if (!strcmp(errors, "replace"))
5783 *known_errorHandler = 2;
5784 else if (!strcmp(errors, "ignore"))
5785 *known_errorHandler = 3;
5786 else if (!strcmp(errors, "xmlcharrefreplace"))
5787 *known_errorHandler = 4;
5788 else
5789 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 }
5791 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005792 case 1: /* strict */
5793 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5794 return -1;
5795 case 2: /* replace */
5796 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 x = charmapencode_output('?', mapping, res, respos);
5798 if (x==enc_EXCEPTION) {
5799 return -1;
5800 }
5801 else if (x==enc_FAILED) {
5802 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5803 return -1;
5804 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005805 }
5806 /* fall through */
5807 case 3: /* ignore */
5808 *inpos = collendpos;
5809 break;
5810 case 4: /* xmlcharrefreplace */
5811 /* generate replacement (temporarily (mis)uses p) */
5812 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 char buffer[2+29+1+1];
5814 char *cp;
5815 sprintf(buffer, "&#%d;", (int)p[collpos]);
5816 for (cp = buffer; *cp; ++cp) {
5817 x = charmapencode_output(*cp, mapping, res, respos);
5818 if (x==enc_EXCEPTION)
5819 return -1;
5820 else if (x==enc_FAILED) {
5821 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5822 return -1;
5823 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005824 }
5825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005826 *inpos = collendpos;
5827 break;
5828 default:
5829 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 encoding, reason, p, size, exceptionObject,
5831 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005832 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005834 if (PyBytes_Check(repunicode)) {
5835 /* Directly copy bytes result to output. */
5836 Py_ssize_t outsize = PyBytes_Size(*res);
5837 Py_ssize_t requiredsize;
5838 repsize = PyBytes_Size(repunicode);
5839 requiredsize = *respos + repsize;
5840 if (requiredsize > outsize)
5841 /* Make room for all additional bytes. */
5842 if (charmapencode_resize(res, respos, requiredsize)) {
5843 Py_DECREF(repunicode);
5844 return -1;
5845 }
5846 memcpy(PyBytes_AsString(*res) + *respos,
5847 PyBytes_AsString(repunicode), repsize);
5848 *respos += repsize;
5849 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005850 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005851 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005852 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005853 /* generate replacement */
5854 repsize = PyUnicode_GET_SIZE(repunicode);
5855 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005856 x = charmapencode_output(*uni2, mapping, res, respos);
5857 if (x==enc_EXCEPTION) {
5858 return -1;
5859 }
5860 else if (x==enc_FAILED) {
5861 Py_DECREF(repunicode);
5862 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5863 return -1;
5864 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005865 }
5866 *inpos = newpos;
5867 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868 }
5869 return 0;
5870}
5871
Alexander Belopolsky40018472011-02-26 01:02:56 +00005872PyObject *
5873PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5874 Py_ssize_t size,
5875 PyObject *mapping,
5876 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 /* output object */
5879 PyObject *res = NULL;
5880 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005881 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005883 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005884 PyObject *errorHandler = NULL;
5885 PyObject *exc = NULL;
5886 /* the following variable is used for caching string comparisons
5887 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5888 * 3=ignore, 4=xmlcharrefreplace */
5889 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890
5891 /* Default to Latin-1 */
5892 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 /* allocate enough for a simple encoding without
5896 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005897 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 if (res == NULL)
5899 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005900 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005903 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 /* try to encode it */
5905 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5906 if (x==enc_EXCEPTION) /* error */
5907 goto onError;
5908 if (x==enc_FAILED) { /* unencodable character */
5909 if (charmap_encoding_error(p, size, &inpos, mapping,
5910 &exc,
5911 &known_errorHandler, &errorHandler, errors,
5912 &res, &respos)) {
5913 goto onError;
5914 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005915 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005916 else
5917 /* done with this character => adjust input position */
5918 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005921 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005922 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005923 if (_PyBytes_Resize(&res, respos) < 0)
5924 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005925
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 Py_XDECREF(exc);
5927 Py_XDECREF(errorHandler);
5928 return res;
5929
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 Py_XDECREF(res);
5932 Py_XDECREF(exc);
5933 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 return NULL;
5935}
5936
Alexander Belopolsky40018472011-02-26 01:02:56 +00005937PyObject *
5938PyUnicode_AsCharmapString(PyObject *unicode,
5939 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940{
5941 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 PyErr_BadArgument();
5943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 }
5945 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005946 PyUnicode_GET_SIZE(unicode),
5947 mapping,
5948 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949}
5950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005951/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005952static void
5953make_translate_exception(PyObject **exceptionObject,
5954 const Py_UNICODE *unicode, Py_ssize_t size,
5955 Py_ssize_t startpos, Py_ssize_t endpos,
5956 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005958 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005959 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 }
5962 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5964 goto onError;
5965 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5966 goto onError;
5967 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5968 goto onError;
5969 return;
5970 onError:
5971 Py_DECREF(*exceptionObject);
5972 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 }
5974}
5975
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005977static void
5978raise_translate_exception(PyObject **exceptionObject,
5979 const Py_UNICODE *unicode, Py_ssize_t size,
5980 Py_ssize_t startpos, Py_ssize_t endpos,
5981 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005982{
5983 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005985 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987}
5988
5989/* error handling callback helper:
5990 build arguments, call the callback and check the arguments,
5991 put the result into newpos and return the replacement string, which
5992 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005993static PyObject *
5994unicode_translate_call_errorhandler(const char *errors,
5995 PyObject **errorHandler,
5996 const char *reason,
5997 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5998 Py_ssize_t startpos, Py_ssize_t endpos,
5999 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000{
Benjamin Peterson142957c2008-07-04 19:55:29 +00006001 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006003 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 PyObject *restuple;
6005 PyObject *resunicode;
6006
6007 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006009 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006010 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 }
6012
6013 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006017
6018 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00006023 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 Py_DECREF(restuple);
6025 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 }
6027 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 &resunicode, &i_newpos)) {
6029 Py_DECREF(restuple);
6030 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006031 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006032 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006034 else
6035 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006036 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6038 Py_DECREF(restuple);
6039 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006040 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006041 Py_INCREF(resunicode);
6042 Py_DECREF(restuple);
6043 return resunicode;
6044}
6045
6046/* Lookup the character ch in the mapping and put the result in result,
6047 which must be decrefed by the caller.
6048 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006049static int
6050charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051{
Christian Heimes217cfd12007-12-02 14:31:20 +00006052 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053 PyObject *x;
6054
6055 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006057 x = PyObject_GetItem(mapping, w);
6058 Py_DECREF(w);
6059 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6061 /* No mapping found means: use 1:1 mapping. */
6062 PyErr_Clear();
6063 *result = NULL;
6064 return 0;
6065 } else
6066 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067 }
6068 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 *result = x;
6070 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006072 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 long value = PyLong_AS_LONG(x);
6074 long max = PyUnicode_GetMax();
6075 if (value < 0 || value > max) {
6076 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006077 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 Py_DECREF(x);
6079 return -1;
6080 }
6081 *result = x;
6082 return 0;
6083 }
6084 else if (PyUnicode_Check(x)) {
6085 *result = x;
6086 return 0;
6087 }
6088 else {
6089 /* wrong return value */
6090 PyErr_SetString(PyExc_TypeError,
6091 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006092 Py_DECREF(x);
6093 return -1;
6094 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006095}
6096/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 if not reallocate and adjust various state variables.
6098 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006099static int
6100charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006103 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006104 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 /* remember old output position */
6106 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6107 /* exponentially overallocate to minimize reallocations */
6108 if (requiredsize < 2 * oldsize)
6109 requiredsize = 2 * oldsize;
6110 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6111 return -1;
6112 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113 }
6114 return 0;
6115}
6116/* lookup the character, put the result in the output string and adjust
6117 various state variables. Return a new reference to the object that
6118 was put in the output buffer in *result, or Py_None, if the mapping was
6119 undefined (in which case no character was written).
6120 The called must decref result.
6121 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006122static int
6123charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6124 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6125 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126{
Walter Dörwald4894c302003-10-24 14:25:28 +00006127 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 /* not found => default to 1:1 mapping */
6131 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132 }
6133 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006135 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 /* no overflow check, because we know that the space is enough */
6137 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 }
6139 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6141 if (repsize==1) {
6142 /* no overflow check, because we know that the space is enough */
6143 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6144 }
6145 else if (repsize!=0) {
6146 /* more than one character */
6147 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6148 (insize - (curinp-startinp)) +
6149 repsize - 1;
6150 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6151 return -1;
6152 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6153 *outp += repsize;
6154 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006155 }
6156 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 return 0;
6159}
6160
Alexander Belopolsky40018472011-02-26 01:02:56 +00006161PyObject *
6162PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6163 Py_ssize_t size,
6164 PyObject *mapping,
6165 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167 /* output object */
6168 PyObject *res = NULL;
6169 /* pointers to the beginning and end+1 of input */
6170 const Py_UNICODE *startp = p;
6171 const Py_UNICODE *endp = p + size;
6172 /* pointer into the output */
6173 Py_UNICODE *str;
6174 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006175 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 char *reason = "character maps to <undefined>";
6177 PyObject *errorHandler = NULL;
6178 PyObject *exc = NULL;
6179 /* the following variable is used for caching string comparisons
6180 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6181 * 3=ignore, 4=xmlcharrefreplace */
6182 int known_errorHandler = -1;
6183
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 PyErr_BadArgument();
6186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006188
6189 /* allocate enough for a simple 1:1 translation without
6190 replacements, if we need more, we'll resize */
6191 res = PyUnicode_FromUnicode(NULL, size);
6192 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006198 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 /* try to encode it */
6200 PyObject *x = NULL;
6201 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6202 Py_XDECREF(x);
6203 goto onError;
6204 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006205 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 if (x!=Py_None) /* it worked => adjust input pointer */
6207 ++p;
6208 else { /* untranslatable character */
6209 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6210 Py_ssize_t repsize;
6211 Py_ssize_t newpos;
6212 Py_UNICODE *uni2;
6213 /* startpos for collecting untranslatable chars */
6214 const Py_UNICODE *collstart = p;
6215 const Py_UNICODE *collend = p+1;
6216 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217
Benjamin Peterson29060642009-01-31 22:14:21 +00006218 /* find all untranslatable characters */
6219 while (collend < endp) {
6220 if (charmaptranslate_lookup(*collend, mapping, &x))
6221 goto onError;
6222 Py_XDECREF(x);
6223 if (x!=Py_None)
6224 break;
6225 ++collend;
6226 }
6227 /* cache callback name lookup
6228 * (if not done yet, i.e. it's the first error) */
6229 if (known_errorHandler==-1) {
6230 if ((errors==NULL) || (!strcmp(errors, "strict")))
6231 known_errorHandler = 1;
6232 else if (!strcmp(errors, "replace"))
6233 known_errorHandler = 2;
6234 else if (!strcmp(errors, "ignore"))
6235 known_errorHandler = 3;
6236 else if (!strcmp(errors, "xmlcharrefreplace"))
6237 known_errorHandler = 4;
6238 else
6239 known_errorHandler = 0;
6240 }
6241 switch (known_errorHandler) {
6242 case 1: /* strict */
6243 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006244 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 case 2: /* replace */
6246 /* No need to check for space, this is a 1:1 replacement */
6247 for (coll = collstart; coll<collend; ++coll)
6248 *str++ = '?';
6249 /* fall through */
6250 case 3: /* ignore */
6251 p = collend;
6252 break;
6253 case 4: /* xmlcharrefreplace */
6254 /* generate replacement (temporarily (mis)uses p) */
6255 for (p = collstart; p < collend; ++p) {
6256 char buffer[2+29+1+1];
6257 char *cp;
6258 sprintf(buffer, "&#%d;", (int)*p);
6259 if (charmaptranslate_makespace(&res, &str,
6260 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6261 goto onError;
6262 for (cp = buffer; *cp; ++cp)
6263 *str++ = *cp;
6264 }
6265 p = collend;
6266 break;
6267 default:
6268 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6269 reason, startp, size, &exc,
6270 collstart-startp, collend-startp, &newpos);
6271 if (repunicode == NULL)
6272 goto onError;
6273 /* generate replacement */
6274 repsize = PyUnicode_GET_SIZE(repunicode);
6275 if (charmaptranslate_makespace(&res, &str,
6276 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6277 Py_DECREF(repunicode);
6278 goto onError;
6279 }
6280 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6281 *str++ = *uni2;
6282 p = startp + newpos;
6283 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006285 }
6286 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006287 /* Resize if we allocated to much */
6288 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006289 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 if (PyUnicode_Resize(&res, respos) < 0)
6291 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006292 }
6293 Py_XDECREF(exc);
6294 Py_XDECREF(errorHandler);
6295 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 Py_XDECREF(res);
6299 Py_XDECREF(exc);
6300 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 return NULL;
6302}
6303
Alexander Belopolsky40018472011-02-26 01:02:56 +00006304PyObject *
6305PyUnicode_Translate(PyObject *str,
6306 PyObject *mapping,
6307 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308{
6309 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006310
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 str = PyUnicode_FromObject(str);
6312 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006313 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006315 PyUnicode_GET_SIZE(str),
6316 mapping,
6317 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 Py_DECREF(str);
6319 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006320
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 Py_XDECREF(str);
6323 return NULL;
6324}
Tim Petersced69f82003-09-16 20:30:58 +00006325
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006326PyObject *
6327PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6328 Py_ssize_t length)
6329{
6330 PyObject *result;
6331 Py_UNICODE *p; /* write pointer into result */
6332 Py_ssize_t i;
6333 /* Copy to a new string */
6334 result = (PyObject *)_PyUnicode_New(length);
6335 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6336 if (result == NULL)
6337 return result;
6338 p = PyUnicode_AS_UNICODE(result);
6339 /* Iterate over code points */
6340 for (i = 0; i < length; i++) {
6341 Py_UNICODE ch =s[i];
6342 if (ch > 127) {
6343 int decimal = Py_UNICODE_TODECIMAL(ch);
6344 if (decimal >= 0)
6345 p[i] = '0' + decimal;
6346 }
6347 }
6348 return result;
6349}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006350/* --- Decimal Encoder ---------------------------------------------------- */
6351
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352int
6353PyUnicode_EncodeDecimal(Py_UNICODE *s,
6354 Py_ssize_t length,
6355 char *output,
6356 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006357{
6358 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359 PyObject *errorHandler = NULL;
6360 PyObject *exc = NULL;
6361 const char *encoding = "decimal";
6362 const char *reason = "invalid decimal Unicode string";
6363 /* the following variable is used for caching string comparisons
6364 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6365 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006366
6367 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 PyErr_BadArgument();
6369 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006370 }
6371
6372 p = s;
6373 end = s + length;
6374 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 register Py_UNICODE ch = *p;
6376 int decimal;
6377 PyObject *repunicode;
6378 Py_ssize_t repsize;
6379 Py_ssize_t newpos;
6380 Py_UNICODE *uni2;
6381 Py_UNICODE *collstart;
6382 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006383
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006385 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 ++p;
6387 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006388 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 decimal = Py_UNICODE_TODECIMAL(ch);
6390 if (decimal >= 0) {
6391 *output++ = '0' + decimal;
6392 ++p;
6393 continue;
6394 }
6395 if (0 < ch && ch < 256) {
6396 *output++ = (char)ch;
6397 ++p;
6398 continue;
6399 }
6400 /* All other characters are considered unencodable */
6401 collstart = p;
6402 collend = p+1;
6403 while (collend < end) {
6404 if ((0 < *collend && *collend < 256) ||
6405 !Py_UNICODE_ISSPACE(*collend) ||
6406 Py_UNICODE_TODECIMAL(*collend))
6407 break;
6408 }
6409 /* cache callback name lookup
6410 * (if not done yet, i.e. it's the first error) */
6411 if (known_errorHandler==-1) {
6412 if ((errors==NULL) || (!strcmp(errors, "strict")))
6413 known_errorHandler = 1;
6414 else if (!strcmp(errors, "replace"))
6415 known_errorHandler = 2;
6416 else if (!strcmp(errors, "ignore"))
6417 known_errorHandler = 3;
6418 else if (!strcmp(errors, "xmlcharrefreplace"))
6419 known_errorHandler = 4;
6420 else
6421 known_errorHandler = 0;
6422 }
6423 switch (known_errorHandler) {
6424 case 1: /* strict */
6425 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6426 goto onError;
6427 case 2: /* replace */
6428 for (p = collstart; p < collend; ++p)
6429 *output++ = '?';
6430 /* fall through */
6431 case 3: /* ignore */
6432 p = collend;
6433 break;
6434 case 4: /* xmlcharrefreplace */
6435 /* generate replacement (temporarily (mis)uses p) */
6436 for (p = collstart; p < collend; ++p)
6437 output += sprintf(output, "&#%d;", (int)*p);
6438 p = collend;
6439 break;
6440 default:
6441 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6442 encoding, reason, s, length, &exc,
6443 collstart-s, collend-s, &newpos);
6444 if (repunicode == NULL)
6445 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006446 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006447 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006448 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6449 Py_DECREF(repunicode);
6450 goto onError;
6451 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 /* generate replacement */
6453 repsize = PyUnicode_GET_SIZE(repunicode);
6454 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6455 Py_UNICODE ch = *uni2;
6456 if (Py_UNICODE_ISSPACE(ch))
6457 *output++ = ' ';
6458 else {
6459 decimal = Py_UNICODE_TODECIMAL(ch);
6460 if (decimal >= 0)
6461 *output++ = '0' + decimal;
6462 else if (0 < ch && ch < 256)
6463 *output++ = (char)ch;
6464 else {
6465 Py_DECREF(repunicode);
6466 raise_encode_exception(&exc, encoding,
6467 s, length, collstart-s, collend-s, reason);
6468 goto onError;
6469 }
6470 }
6471 }
6472 p = s + newpos;
6473 Py_DECREF(repunicode);
6474 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006475 }
6476 /* 0-terminate the output string */
Victor Stinner4f2dab52011-05-27 16:46:51 +02006477 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 Py_XDECREF(exc);
6479 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006480 return 0;
6481
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 Py_XDECREF(exc);
6484 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006485 return -1;
6486}
6487
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488/* --- Helpers ------------------------------------------------------------ */
6489
Eric Smith8c663262007-08-25 02:26:07 +00006490#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006491#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006492
Thomas Wouters477c8d52006-05-27 19:21:47 +00006493#include "stringlib/count.h"
6494#include "stringlib/find.h"
6495#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006496#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006497
Eric Smith5807c412008-05-11 21:00:57 +00006498#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006499#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006500#include "stringlib/localeutil.h"
6501
/* Helper macro to fix up start/end slice values in place.
   `end` is clipped to `len`; a negative `start` or `end` is taken relative
   to `len` and then clamped at 0.  `start` and `end` must be modifiable
   lvalues and are evaluated more than once.
   Wrapped in do { } while (0) so that the macro expands to a single
   statement and is safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006516
Alexander Belopolsky40018472011-02-26 01:02:56 +00006517Py_ssize_t
6518PyUnicode_Count(PyObject *str,
6519 PyObject *substr,
6520 Py_ssize_t start,
6521 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006523 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006524 PyUnicodeObject* str_obj;
6525 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006526
Thomas Wouters477c8d52006-05-27 19:21:47 +00006527 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6528 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006530 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6531 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 Py_DECREF(str_obj);
6533 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 }
Tim Petersced69f82003-09-16 20:30:58 +00006535
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006536 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006537 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006538 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6539 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006540 );
6541
6542 Py_DECREF(sub_obj);
6543 Py_DECREF(str_obj);
6544
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545 return result;
6546}
6547
Alexander Belopolsky40018472011-02-26 01:02:56 +00006548Py_ssize_t
6549PyUnicode_Find(PyObject *str,
6550 PyObject *sub,
6551 Py_ssize_t start,
6552 Py_ssize_t end,
6553 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006555 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006556
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006558 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006560 sub = PyUnicode_FromObject(sub);
6561 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 Py_DECREF(str);
6563 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564 }
Tim Petersced69f82003-09-16 20:30:58 +00006565
Thomas Wouters477c8d52006-05-27 19:21:47 +00006566 if (direction > 0)
6567 result = stringlib_find_slice(
6568 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6569 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6570 start, end
6571 );
6572 else
6573 result = stringlib_rfind_slice(
6574 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6575 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6576 start, end
6577 );
6578
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006580 Py_DECREF(sub);
6581
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 return result;
6583}
6584
Alexander Belopolsky40018472011-02-26 01:02:56 +00006585static int
6586tailmatch(PyUnicodeObject *self,
6587 PyUnicodeObject *substring,
6588 Py_ssize_t start,
6589 Py_ssize_t end,
6590 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 if (substring->length == 0)
6593 return 1;
6594
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006595 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596 end -= substring->length;
6597 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006599
6600 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 if (Py_UNICODE_MATCH(self, end, substring))
6602 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 } else {
6604 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 }
6607
6608 return 0;
6609}
6610
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611Py_ssize_t
6612PyUnicode_Tailmatch(PyObject *str,
6613 PyObject *substr,
6614 Py_ssize_t start,
6615 Py_ssize_t end,
6616 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006618 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006619
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 str = PyUnicode_FromObject(str);
6621 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 substr = PyUnicode_FromObject(substr);
6624 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 Py_DECREF(str);
6626 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
Tim Petersced69f82003-09-16 20:30:58 +00006628
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 (PyUnicodeObject *)substr,
6631 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 Py_DECREF(str);
6633 Py_DECREF(substr);
6634 return result;
6635}
6636
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637/* Apply fixfct filter to the Unicode object self and return a
6638 reference to the modified object */
6639
Alexander Belopolsky40018472011-02-26 01:02:56 +00006640static PyObject *
6641fixup(PyUnicodeObject *self,
6642 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643{
6644
6645 PyUnicodeObject *u;
6646
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006647 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006649 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006650
6651 Py_UNICODE_COPY(u->str, self->str, self->length);
6652
Tim Peters7a29bd52001-09-12 03:03:31 +00006653 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 /* fixfct should return TRUE if it modified the buffer. If
6655 FALSE, return a reference to the original buffer instead
6656 (to save space, not time) */
6657 Py_INCREF(self);
6658 Py_DECREF(u);
6659 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660 }
6661 return (PyObject*) u;
6662}
6663
Alexander Belopolsky40018472011-02-26 01:02:56 +00006664static int
6665fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006667 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 Py_UNICODE *s = self->str;
6669 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006673
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 ch = Py_UNICODE_TOUPPER(*s);
6675 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 *s = ch;
6678 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 s++;
6680 }
6681
6682 return status;
6683}
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685static int
6686fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006688 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689 Py_UNICODE *s = self->str;
6690 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006691
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006694
Benjamin Peterson29060642009-01-31 22:14:21 +00006695 ch = Py_UNICODE_TOLOWER(*s);
6696 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 *s = ch;
6699 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700 s++;
6701 }
6702
6703 return status;
6704}
6705
Alexander Belopolsky40018472011-02-26 01:02:56 +00006706static int
6707fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006709 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 Py_UNICODE *s = self->str;
6711 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006712
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 while (len-- > 0) {
6714 if (Py_UNICODE_ISUPPER(*s)) {
6715 *s = Py_UNICODE_TOLOWER(*s);
6716 status = 1;
6717 } else if (Py_UNICODE_ISLOWER(*s)) {
6718 *s = Py_UNICODE_TOUPPER(*s);
6719 status = 1;
6720 }
6721 s++;
6722 }
6723
6724 return status;
6725}
6726
Alexander Belopolsky40018472011-02-26 01:02:56 +00006727static int
6728fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006730 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006731 Py_UNICODE *s = self->str;
6732 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006733
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006734 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006736 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 *s = Py_UNICODE_TOUPPER(*s);
6738 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006740 s++;
6741 while (--len > 0) {
6742 if (Py_UNICODE_ISUPPER(*s)) {
6743 *s = Py_UNICODE_TOLOWER(*s);
6744 status = 1;
6745 }
6746 s++;
6747 }
6748 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749}
6750
Alexander Belopolsky40018472011-02-26 01:02:56 +00006751static int
6752fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753{
6754 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6755 register Py_UNICODE *e;
6756 int previous_is_cased;
6757
6758 /* Shortcut for single character strings */
6759 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006760 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6761 if (*p != ch) {
6762 *p = ch;
6763 return 1;
6764 }
6765 else
6766 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 }
Tim Petersced69f82003-09-16 20:30:58 +00006768
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 e = p + PyUnicode_GET_SIZE(self);
6770 previous_is_cased = 0;
6771 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006773
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 if (previous_is_cased)
6775 *p = Py_UNICODE_TOLOWER(ch);
6776 else
6777 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006778
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 if (Py_UNICODE_ISLOWER(ch) ||
6780 Py_UNICODE_ISUPPER(ch) ||
6781 Py_UNICODE_ISTITLE(ch))
6782 previous_is_cased = 1;
6783 else
6784 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 }
6786 return 1;
6787}
6788
Tim Peters8ce9f162004-08-27 01:49:32 +00006789PyObject *
6790PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791{
Skip Montanaro6543b452004-09-16 03:28:13 +00006792 const Py_UNICODE blank = ' ';
6793 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006794 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006795 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006796 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6797 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006798 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6799 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006800 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006801 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802
Tim Peters05eba1f2004-08-27 21:32:02 +00006803 fseq = PySequence_Fast(seq, "");
6804 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006805 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006806 }
6807
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006808 /* NOTE: the following code can't call back into Python code,
6809 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006810 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006811
Tim Peters05eba1f2004-08-27 21:32:02 +00006812 seqlen = PySequence_Fast_GET_SIZE(fseq);
6813 /* If empty sequence, return u"". */
6814 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006815 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6816 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006817 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006818 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006819 /* If singleton sequence with an exact Unicode, return that. */
6820 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 item = items[0];
6822 if (PyUnicode_CheckExact(item)) {
6823 Py_INCREF(item);
6824 res = (PyUnicodeObject *)item;
6825 goto Done;
6826 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006827 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006828 else {
6829 /* Set up sep and seplen */
6830 if (separator == NULL) {
6831 sep = &blank;
6832 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006833 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006834 else {
6835 if (!PyUnicode_Check(separator)) {
6836 PyErr_Format(PyExc_TypeError,
6837 "separator: expected str instance,"
6838 " %.80s found",
6839 Py_TYPE(separator)->tp_name);
6840 goto onError;
6841 }
6842 sep = PyUnicode_AS_UNICODE(separator);
6843 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006844 }
6845 }
6846
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006847 /* There are at least two things to join, or else we have a subclass
6848 * of str in the sequence.
6849 * Do a pre-pass to figure out the total amount of space we'll
6850 * need (sz), and see whether all argument are strings.
6851 */
6852 sz = 0;
6853 for (i = 0; i < seqlen; i++) {
6854 const Py_ssize_t old_sz = sz;
6855 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 if (!PyUnicode_Check(item)) {
6857 PyErr_Format(PyExc_TypeError,
6858 "sequence item %zd: expected str instance,"
6859 " %.80s found",
6860 i, Py_TYPE(item)->tp_name);
6861 goto onError;
6862 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006863 sz += PyUnicode_GET_SIZE(item);
6864 if (i != 0)
6865 sz += seplen;
6866 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6867 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006868 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006869 goto onError;
6870 }
6871 }
Tim Petersced69f82003-09-16 20:30:58 +00006872
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006873 res = _PyUnicode_New(sz);
6874 if (res == NULL)
6875 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006876
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006877 /* Catenate everything. */
6878 res_p = PyUnicode_AS_UNICODE(res);
6879 for (i = 0; i < seqlen; ++i) {
6880 Py_ssize_t itemlen;
6881 item = items[i];
6882 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 /* Copy item, and maybe the separator. */
6884 if (i) {
6885 Py_UNICODE_COPY(res_p, sep, seplen);
6886 res_p += seplen;
6887 }
6888 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6889 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006890 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006891
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006893 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894 return (PyObject *)res;
6895
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006897 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006898 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006899 return NULL;
6900}
6901
Alexander Belopolsky40018472011-02-26 01:02:56 +00006902static PyUnicodeObject *
6903pad(PyUnicodeObject *self,
6904 Py_ssize_t left,
6905 Py_ssize_t right,
6906 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907{
6908 PyUnicodeObject *u;
6909
6910 if (left < 0)
6911 left = 0;
6912 if (right < 0)
6913 right = 0;
6914
Tim Peters7a29bd52001-09-12 03:03:31 +00006915 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006916 Py_INCREF(self);
6917 return self;
6918 }
6919
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006920 if (left > PY_SSIZE_T_MAX - self->length ||
6921 right > PY_SSIZE_T_MAX - (left + self->length)) {
6922 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6923 return NULL;
6924 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925 u = _PyUnicode_New(left + self->length + right);
6926 if (u) {
6927 if (left)
6928 Py_UNICODE_FILL(u->str, fill, left);
6929 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6930 if (right)
6931 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6932 }
6933
6934 return u;
6935}
6936
Alexander Belopolsky40018472011-02-26 01:02:56 +00006937PyObject *
6938PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941
6942 string = PyUnicode_FromObject(string);
6943 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006946 list = stringlib_splitlines(
6947 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6948 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949
6950 Py_DECREF(string);
6951 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952}
6953
Alexander Belopolsky40018472011-02-26 01:02:56 +00006954static PyObject *
6955split(PyUnicodeObject *self,
6956 PyUnicodeObject *substring,
6957 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006960 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006963 return stringlib_split_whitespace(
6964 (PyObject*) self, self->str, self->length, maxcount
6965 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006966
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006967 return stringlib_split(
6968 (PyObject*) self, self->str, self->length,
6969 substring->str, substring->length,
6970 maxcount
6971 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972}
6973
Alexander Belopolsky40018472011-02-26 01:02:56 +00006974static PyObject *
6975rsplit(PyUnicodeObject *self,
6976 PyUnicodeObject *substring,
6977 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006978{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006979 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006980 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006981
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006982 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006983 return stringlib_rsplit_whitespace(
6984 (PyObject*) self, self->str, self->length, maxcount
6985 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006986
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006987 return stringlib_rsplit(
6988 (PyObject*) self, self->str, self->length,
6989 substring->str, substring->length,
6990 maxcount
6991 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006992}
6993
Alexander Belopolsky40018472011-02-26 01:02:56 +00006994static PyObject *
6995replace(PyUnicodeObject *self,
6996 PyUnicodeObject *str1,
6997 PyUnicodeObject *str2,
6998 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999{
7000 PyUnicodeObject *u;
7001
7002 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007004 else if (maxcount == 0 || self->length == 0)
7005 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006
Thomas Wouters477c8d52006-05-27 19:21:47 +00007007 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00007008 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007009 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007010 if (str1->length == 0)
7011 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007012 if (str1->length == 1) {
7013 /* replace characters */
7014 Py_UNICODE u1, u2;
7015 if (!findchar(self->str, self->length, str1->str[0]))
7016 goto nothing;
7017 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7018 if (!u)
7019 return NULL;
7020 Py_UNICODE_COPY(u->str, self->str, self->length);
7021 u1 = str1->str[0];
7022 u2 = str2->str[0];
7023 for (i = 0; i < u->length; i++)
7024 if (u->str[i] == u1) {
7025 if (--maxcount < 0)
7026 break;
7027 u->str[i] = u2;
7028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007030 i = stringlib_find(
7031 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007032 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007033 if (i < 0)
7034 goto nothing;
7035 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7036 if (!u)
7037 return NULL;
7038 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007039
7040 /* change everything in-place, starting with this one */
7041 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7042 i += str1->length;
7043
7044 while ( --maxcount > 0) {
7045 i = stringlib_find(self->str+i, self->length-i,
7046 str1->str, str1->length,
7047 i);
7048 if (i == -1)
7049 break;
7050 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7051 i += str1->length;
7052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007055
Brett Cannonb94767f2011-02-22 20:15:44 +00007056 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007057 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 Py_UNICODE *p;
7059
7060 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007061 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7062 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007063 if (n == 0)
7064 goto nothing;
7065 /* new_size = self->length + n * (str2->length - str1->length)); */
7066 delta = (str2->length - str1->length);
7067 if (delta == 0) {
7068 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007070 product = n * (str2->length - str1->length);
7071 if ((product / (str2->length - str1->length)) != n) {
7072 PyErr_SetString(PyExc_OverflowError,
7073 "replace string is too long");
7074 return NULL;
7075 }
7076 new_size = self->length + product;
7077 if (new_size < 0) {
7078 PyErr_SetString(PyExc_OverflowError,
7079 "replace string is too long");
7080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081 }
7082 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007083 u = _PyUnicode_New(new_size);
7084 if (!u)
7085 return NULL;
7086 i = 0;
7087 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007088 if (str1->length > 0) {
7089 while (n-- > 0) {
7090 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007091 j = stringlib_find(self->str+i, self->length-i,
7092 str1->str, str1->length,
7093 i);
7094 if (j == -1)
7095 break;
7096 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007097 /* copy unchanged part [i:j] */
7098 Py_UNICODE_COPY(p, self->str+i, j-i);
7099 p += j - i;
7100 }
7101 /* copy substitution string */
7102 if (str2->length > 0) {
7103 Py_UNICODE_COPY(p, str2->str, str2->length);
7104 p += str2->length;
7105 }
7106 i = j + str1->length;
7107 }
7108 if (i < self->length)
7109 /* copy tail [i:] */
7110 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7111 } else {
7112 /* interleave */
7113 while (n > 0) {
7114 Py_UNICODE_COPY(p, str2->str, str2->length);
7115 p += str2->length;
7116 if (--n <= 0)
7117 break;
7118 *p++ = self->str[i++];
7119 }
7120 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007124
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007126 /* nothing to replace; return original string (when possible) */
7127 if (PyUnicode_CheckExact(self)) {
7128 Py_INCREF(self);
7129 return (PyObject *) self;
7130 }
7131 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132}
7133
7134/* --- Unicode Object Methods --------------------------------------------- */
7135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007136PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007138\n\
7139Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007140characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141
7142static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007143unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007145 return fixup(self, fixtitle);
7146}
7147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007148PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150\n\
7151Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007152have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153
7154static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007155unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007156{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 return fixup(self, fixcapitalize);
7158}
7159
#if 0
/* Disabled code: S.capwords() is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word via
   fixup(..., fixcapitalize) and join the words with single blanks.
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)word, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return result;
}
#endif
7197
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007198/* Argument converter. Coerces to a single unicode character */
7199
7200static int
7201convert_uc(PyObject *obj, void *addr)
7202{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007203 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7204 PyObject *uniobj;
7205 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007206
Benjamin Peterson14339b62009-01-31 16:36:08 +00007207 uniobj = PyUnicode_FromObject(obj);
7208 if (uniobj == NULL) {
7209 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007211 return 0;
7212 }
7213 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7214 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007216 Py_DECREF(uniobj);
7217 return 0;
7218 }
7219 unistr = PyUnicode_AS_UNICODE(uniobj);
7220 *fillcharloc = unistr[0];
7221 Py_DECREF(uniobj);
7222 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007223}
7224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007225PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007228Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007229done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230
7231static PyObject *
7232unicode_center(PyUnicodeObject *self, PyObject *args)
7233{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007234 Py_ssize_t marg, left;
7235 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007236 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237
Thomas Woutersde017742006-02-16 19:34:37 +00007238 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007239 return NULL;
7240
Tim Peters7a29bd52001-09-12 03:03:31 +00007241 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007242 Py_INCREF(self);
7243 return (PyObject*) self;
7244 }
7245
7246 marg = width - self->length;
7247 left = marg / 2 + (marg & width & 1);
7248
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007249 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250}
7251
Marc-André Lemburge5034372000-08-08 08:04:29 +00007252#if 0
7253
7254/* This code should go into some future Unicode collation support
7255 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007256 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007257
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007258/* speedy UTF-16 code point order comparison */
7259/* gleaned from: */
7260/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7261
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007262static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007263{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007264 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007265 0, 0, 0, 0, 0, 0, 0, 0,
7266 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007267 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007268};
7269
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270static int
7271unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7272{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007273 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007274
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 Py_UNICODE *s1 = str1->str;
7276 Py_UNICODE *s2 = str2->str;
7277
7278 len1 = str1->length;
7279 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007280
Guido van Rossumd57fd912000-03-10 22:53:23 +00007281 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007282 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007283
7284 c1 = *s1++;
7285 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007286
Benjamin Peterson29060642009-01-31 22:14:21 +00007287 if (c1 > (1<<11) * 26)
7288 c1 += utf16Fixup[c1>>11];
7289 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007290 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007291 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007292
7293 if (c1 != c2)
7294 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007295
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007296 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007297 }
7298
7299 return (len1 < len2) ? -1 : (len1 != len2);
7300}
7301
Marc-André Lemburge5034372000-08-08 08:04:29 +00007302#else
7303
7304static int
7305unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7306{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007307 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007308
7309 Py_UNICODE *s1 = str1->str;
7310 Py_UNICODE *s2 = str2->str;
7311
7312 len1 = str1->length;
7313 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007314
Marc-André Lemburge5034372000-08-08 08:04:29 +00007315 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007316 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007317
Fredrik Lundh45714e92001-06-26 16:39:36 +00007318 c1 = *s1++;
7319 c2 = *s2++;
7320
7321 if (c1 != c2)
7322 return (c1 < c2) ? -1 : 1;
7323
Marc-André Lemburge5034372000-08-08 08:04:29 +00007324 len1--; len2--;
7325 }
7326
7327 return (len1 < len2) ? -1 : (len1 != len2);
7328}
7329
7330#endif
7331
Alexander Belopolsky40018472011-02-26 01:02:56 +00007332int
7333PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007335 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7336 return unicode_compare((PyUnicodeObject *)left,
7337 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007338 PyErr_Format(PyExc_TypeError,
7339 "Can't compare %.100s and %.100s",
7340 left->ob_type->tp_name,
7341 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342 return -1;
7343}
7344
Martin v. Löwis5b222132007-06-10 09:51:05 +00007345int
7346PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7347{
7348 int i;
7349 Py_UNICODE *id;
7350 assert(PyUnicode_Check(uni));
7351 id = PyUnicode_AS_UNICODE(uni);
7352 /* Compare Unicode string and source character set string */
7353 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007354 if (id[i] != str[i])
7355 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007356 /* This check keeps Python strings that end in '\0' from comparing equal
7357 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007358 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007360 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007361 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007362 return 0;
7363}
7364
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007365
Benjamin Peterson29060642009-01-31 22:14:21 +00007366#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007367 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007368
Alexander Belopolsky40018472011-02-26 01:02:56 +00007369PyObject *
7370PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007371{
7372 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007373
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007374 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7375 PyObject *v;
Benjamin Peterson5fd4bd32011-03-06 09:06:34 -06007376 if (PyUnicode_GET_SIZE(left) != PyUnicode_GET_SIZE(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007377 if (op == Py_EQ) {
7378 Py_INCREF(Py_False);
7379 return Py_False;
7380 }
7381 if (op == Py_NE) {
7382 Py_INCREF(Py_True);
7383 return Py_True;
7384 }
7385 }
7386 if (left == right)
7387 result = 0;
7388 else
7389 result = unicode_compare((PyUnicodeObject *)left,
7390 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007391
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007392 /* Convert the return value to a Boolean */
7393 switch (op) {
7394 case Py_EQ:
7395 v = TEST_COND(result == 0);
7396 break;
7397 case Py_NE:
7398 v = TEST_COND(result != 0);
7399 break;
7400 case Py_LE:
7401 v = TEST_COND(result <= 0);
7402 break;
7403 case Py_GE:
7404 v = TEST_COND(result >= 0);
7405 break;
7406 case Py_LT:
7407 v = TEST_COND(result == -1);
7408 break;
7409 case Py_GT:
7410 v = TEST_COND(result == 1);
7411 break;
7412 default:
7413 PyErr_BadArgument();
7414 return NULL;
7415 }
7416 Py_INCREF(v);
7417 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007418 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007419
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007420 Py_INCREF(Py_NotImplemented);
7421 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007422}
7423
Alexander Belopolsky40018472011-02-26 01:02:56 +00007424int
7425PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007426{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007427 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007428 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007429
7430 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007431 sub = PyUnicode_FromObject(element);
7432 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 PyErr_Format(PyExc_TypeError,
7434 "'in <string>' requires string as left operand, not %s",
7435 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007436 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007437 }
7438
Thomas Wouters477c8d52006-05-27 19:21:47 +00007439 str = PyUnicode_FromObject(container);
7440 if (!str) {
7441 Py_DECREF(sub);
7442 return -1;
7443 }
7444
7445 result = stringlib_contains_obj(str, sub);
7446
7447 Py_DECREF(str);
7448 Py_DECREF(sub);
7449
Guido van Rossum403d68b2000-03-13 15:55:09 +00007450 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007451}
7452
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453/* Concat to string or Unicode object giving a new Unicode object. */
7454
Alexander Belopolsky40018472011-02-26 01:02:56 +00007455PyObject *
7456PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457{
7458 PyUnicodeObject *u = NULL, *v = NULL, *w;
7459
7460 /* Coerce the two arguments */
7461 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7462 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007463 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007464 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7465 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
7468 /* Shortcuts */
7469 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 Py_DECREF(v);
7471 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007472 }
7473 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 Py_DECREF(u);
7475 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476 }
7477
7478 /* Concat the two Unicode strings */
7479 w = _PyUnicode_New(u->length + v->length);
7480 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007481 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482 Py_UNICODE_COPY(w->str, u->str, u->length);
7483 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7484
7485 Py_DECREF(u);
7486 Py_DECREF(v);
7487 return (PyObject *)w;
7488
Benjamin Peterson29060642009-01-31 22:14:21 +00007489 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007490 Py_XDECREF(u);
7491 Py_XDECREF(v);
7492 return NULL;
7493}
7494
Walter Dörwald1ab83302007-05-18 17:15:44 +00007495void
7496PyUnicode_Append(PyObject **pleft, PyObject *right)
7497{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007498 PyObject *new;
7499 if (*pleft == NULL)
7500 return;
7501 if (right == NULL || !PyUnicode_Check(*pleft)) {
7502 Py_DECREF(*pleft);
7503 *pleft = NULL;
7504 return;
7505 }
7506 new = PyUnicode_Concat(*pleft, right);
7507 Py_DECREF(*pleft);
7508 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007509}
7510
7511void
7512PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7513{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007514 PyUnicode_Append(pleft, right);
7515 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007516}
7517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007518PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007519 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007521Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007522string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007523interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007524
7525static PyObject *
7526unicode_count(PyUnicodeObject *self, PyObject *args)
7527{
7528 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007529 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007530 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007531 PyObject *result;
7532
Jesus Ceaac451502011-04-20 17:09:23 +02007533 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7534 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007536
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007537 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007538 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007539 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007540 substring->str, substring->length,
7541 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007542 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007543
7544 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007545
Guido van Rossumd57fd912000-03-10 22:53:23 +00007546 return result;
7547}
7548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007549PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007550 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007552Encode S using the codec registered for encoding. Default encoding\n\
7553is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007554handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007555a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7556'xmlcharrefreplace' as well as any other name registered with\n\
7557codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007558
7559static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007560unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007561{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007562 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563 char *encoding = NULL;
7564 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007565
Benjamin Peterson308d6372009-09-18 21:42:35 +00007566 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7567 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007568 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007569 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007570}
7571
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007572PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007573 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574\n\
7575Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007576If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
7578static PyObject*
7579unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7580{
7581 Py_UNICODE *e;
7582 Py_UNICODE *p;
7583 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007584 Py_UNICODE *qe;
7585 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 PyUnicodeObject *u;
7587 int tabsize = 8;
7588
7589 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591
Thomas Wouters7e474022000-07-16 12:04:32 +00007592 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007593 i = 0; /* chars up to and including most recent \n or \r */
7594 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7595 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 for (p = self->str; p < e; p++)
7597 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 if (tabsize > 0) {
7599 incr = tabsize - (j % tabsize); /* cannot overflow */
7600 if (j > PY_SSIZE_T_MAX - incr)
7601 goto overflow1;
7602 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007603 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 if (j > PY_SSIZE_T_MAX - 1)
7607 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 j++;
7609 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 if (i > PY_SSIZE_T_MAX - j)
7611 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007613 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 }
7615 }
7616
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007617 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007619
Guido van Rossumd57fd912000-03-10 22:53:23 +00007620 /* Second pass: create output string and fill it */
7621 u = _PyUnicode_New(i + j);
7622 if (!u)
7623 return NULL;
7624
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007625 j = 0; /* same as in first pass */
7626 q = u->str; /* next output char */
7627 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628
7629 for (p = self->str; p < e; p++)
7630 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 if (tabsize > 0) {
7632 i = tabsize - (j % tabsize);
7633 j += i;
7634 while (i--) {
7635 if (q >= qe)
7636 goto overflow2;
7637 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007638 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007640 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 else {
7642 if (q >= qe)
7643 goto overflow2;
7644 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007645 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646 if (*p == '\n' || *p == '\r')
7647 j = 0;
7648 }
7649
7650 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007651
7652 overflow2:
7653 Py_DECREF(u);
7654 overflow1:
7655 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657}
7658
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007659PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661\n\
7662Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007663such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664arguments start and end are interpreted as in slice notation.\n\
7665\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007666Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667
7668static PyObject *
7669unicode_find(PyUnicodeObject *self, PyObject *args)
7670{
Jesus Ceaac451502011-04-20 17:09:23 +02007671 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007672 Py_ssize_t start;
7673 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007674 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675
Jesus Ceaac451502011-04-20 17:09:23 +02007676 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7677 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679
Thomas Wouters477c8d52006-05-27 19:21:47 +00007680 result = stringlib_find_slice(
7681 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7682 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7683 start, end
7684 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007685
7686 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007687
Christian Heimes217cfd12007-12-02 14:31:20 +00007688 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007689}
7690
7691static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007692unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693{
7694 if (index < 0 || index >= self->length) {
7695 PyErr_SetString(PyExc_IndexError, "string index out of range");
7696 return NULL;
7697 }
7698
7699 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7700}
7701
Guido van Rossumc2504932007-09-18 19:42:40 +00007702/* Believe it or not, this produces the same value for ASCII strings
7703 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007704static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007705unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706{
Guido van Rossumc2504932007-09-18 19:42:40 +00007707 Py_ssize_t len;
7708 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007709 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007710
7711 if (self->hash != -1)
7712 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007713 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007714 p = self->str;
7715 x = *p << 7;
7716 while (--len >= 0)
7717 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007718 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007719 if (x == -1)
7720 x = -2;
7721 self->hash = x;
7722 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723}
7724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007725PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007727\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007728Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729
7730static PyObject *
7731unicode_index(PyUnicodeObject *self, PyObject *args)
7732{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007733 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +02007734 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007735 Py_ssize_t start;
7736 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737
Jesus Ceaac451502011-04-20 17:09:23 +02007738 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7739 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007741
Thomas Wouters477c8d52006-05-27 19:21:47 +00007742 result = stringlib_find_slice(
7743 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7744 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7745 start, end
7746 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747
7748 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007749
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 if (result < 0) {
7751 PyErr_SetString(PyExc_ValueError, "substring not found");
7752 return NULL;
7753 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007754
Christian Heimes217cfd12007-12-02 14:31:20 +00007755 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756}
7757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007758PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007760\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007761Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007762at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763
7764static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007765unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007766{
7767 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7768 register const Py_UNICODE *e;
7769 int cased;
7770
Guido van Rossumd57fd912000-03-10 22:53:23 +00007771 /* Shortcut for single character strings */
7772 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007775 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007776 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007778
Guido van Rossumd57fd912000-03-10 22:53:23 +00007779 e = p + PyUnicode_GET_SIZE(self);
7780 cased = 0;
7781 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007783
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7785 return PyBool_FromLong(0);
7786 else if (!cased && Py_UNICODE_ISLOWER(ch))
7787 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007788 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007789 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007790}
7791
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007792PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007795Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007796at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797
7798static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007799unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007800{
7801 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7802 register const Py_UNICODE *e;
7803 int cased;
7804
Guido van Rossumd57fd912000-03-10 22:53:23 +00007805 /* Shortcut for single character strings */
7806 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007809 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007810 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007812
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813 e = p + PyUnicode_GET_SIZE(self);
7814 cased = 0;
7815 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007817
Benjamin Peterson29060642009-01-31 22:14:21 +00007818 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7819 return PyBool_FromLong(0);
7820 else if (!cased && Py_UNICODE_ISUPPER(ch))
7821 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007823 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007824}
7825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007826PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007828\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007829Return True if S is a titlecased string and there is at least one\n\
7830character in S, i.e. upper- and titlecase characters may only\n\
7831follow uncased characters and lowercase characters only cased ones.\n\
7832Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833
7834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007835unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836{
7837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7838 register const Py_UNICODE *e;
7839 int cased, previous_is_cased;
7840
Guido van Rossumd57fd912000-03-10 22:53:23 +00007841 /* Shortcut for single character strings */
7842 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007843 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7844 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007845
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007846 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007847 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007849
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 e = p + PyUnicode_GET_SIZE(self);
7851 cased = 0;
7852 previous_is_cased = 0;
7853 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007855
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7857 if (previous_is_cased)
7858 return PyBool_FromLong(0);
7859 previous_is_cased = 1;
7860 cased = 1;
7861 }
7862 else if (Py_UNICODE_ISLOWER(ch)) {
7863 if (!previous_is_cased)
7864 return PyBool_FromLong(0);
7865 previous_is_cased = 1;
7866 cased = 1;
7867 }
7868 else
7869 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007871 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872}
7873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007874PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007877Return True if all characters in S are whitespace\n\
7878and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879
7880static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007881unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007882{
7883 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7884 register const Py_UNICODE *e;
7885
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 /* Shortcut for single character strings */
7887 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 Py_UNICODE_ISSPACE(*p))
7889 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007890
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007891 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007892 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007894
Guido van Rossumd57fd912000-03-10 22:53:23 +00007895 e = p + PyUnicode_GET_SIZE(self);
7896 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 if (!Py_UNICODE_ISSPACE(*p))
7898 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007900 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901}
7902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007903PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007905\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007906Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007907and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007908
7909static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007910unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007911{
7912 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7913 register const Py_UNICODE *e;
7914
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007915 /* Shortcut for single character strings */
7916 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 Py_UNICODE_ISALPHA(*p))
7918 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007919
7920 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007921 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007923
7924 e = p + PyUnicode_GET_SIZE(self);
7925 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 if (!Py_UNICODE_ISALPHA(*p))
7927 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007928 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007929 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007930}
7931
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007932PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007934\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007935Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007936and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007937
7938static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007939unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007940{
7941 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7942 register const Py_UNICODE *e;
7943
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007944 /* Shortcut for single character strings */
7945 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 Py_UNICODE_ISALNUM(*p))
7947 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007948
7949 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007950 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007951 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007952
7953 e = p + PyUnicode_GET_SIZE(self);
7954 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 if (!Py_UNICODE_ISALNUM(*p))
7956 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007957 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007958 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007959}
7960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007961PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007963\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007964Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007965False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966
7967static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007968unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969{
7970 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7971 register const Py_UNICODE *e;
7972
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 /* Shortcut for single character strings */
7974 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 Py_UNICODE_ISDECIMAL(*p))
7976 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007977
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007978 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007979 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007981
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982 e = p + PyUnicode_GET_SIZE(self);
7983 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 if (!Py_UNICODE_ISDECIMAL(*p))
7985 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007987 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988}
7989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007990PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007993Return True if all characters in S are digits\n\
7994and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995
7996static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007997unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007998{
7999 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8000 register const Py_UNICODE *e;
8001
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002 /* Shortcut for single character strings */
8003 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 Py_UNICODE_ISDIGIT(*p))
8005 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008007 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008008 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008010
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 e = p + PyUnicode_GET_SIZE(self);
8012 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 if (!Py_UNICODE_ISDIGIT(*p))
8014 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008016 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017}
8018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008019PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008020 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008021\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008022Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008023False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024
8025static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008026unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027{
8028 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8029 register const Py_UNICODE *e;
8030
Guido van Rossumd57fd912000-03-10 22:53:23 +00008031 /* Shortcut for single character strings */
8032 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008033 Py_UNICODE_ISNUMERIC(*p))
8034 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008036 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008037 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008039
Guido van Rossumd57fd912000-03-10 22:53:23 +00008040 e = p + PyUnicode_GET_SIZE(self);
8041 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 if (!Py_UNICODE_ISNUMERIC(*p))
8043 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008045 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046}
8047
Martin v. Löwis47383402007-08-15 07:32:56 +00008048int
8049PyUnicode_IsIdentifier(PyObject *self)
8050{
8051 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8052 register const Py_UNICODE *e;
8053
8054 /* Special case for empty strings */
8055 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008057
8058 /* PEP 3131 says that the first character must be in
8059 XID_Start and subsequent characters in XID_Continue,
8060 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008062 letters, digits, underscore). However, given the current
8063 definition of XID_Start and XID_Continue, it is sufficient
8064 to check just for these, except that _ must be allowed
8065 as starting an identifier. */
8066 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8067 return 0;
8068
8069 e = p + PyUnicode_GET_SIZE(self);
8070 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 if (!_PyUnicode_IsXidContinue(*p))
8072 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008073 }
8074 return 1;
8075}
8076
8077PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008079\n\
8080Return True if S is a valid identifier according\n\
8081to the language definition.");
8082
8083static PyObject*
8084unicode_isidentifier(PyObject *self)
8085{
8086 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8087}
8088
Georg Brandl559e5d72008-06-11 18:37:52 +00008089PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008091\n\
8092Return True if all characters in S are considered\n\
8093printable in repr() or S is empty, False otherwise.");
8094
8095static PyObject*
8096unicode_isprintable(PyObject *self)
8097{
8098 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8099 register const Py_UNICODE *e;
8100
8101 /* Shortcut for single character strings */
8102 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8103 Py_RETURN_TRUE;
8104 }
8105
8106 e = p + PyUnicode_GET_SIZE(self);
8107 for (; p < e; p++) {
8108 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8109 Py_RETURN_FALSE;
8110 }
8111 }
8112 Py_RETURN_TRUE;
8113}
8114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008115PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008116 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117\n\
8118Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008119iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
8121static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008122unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008123{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008124 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008125}
8126
Martin v. Löwis18e16552006-02-15 17:27:45 +00008127static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128unicode_length(PyUnicodeObject *self)
8129{
8130 return self->length;
8131}
8132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008133PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008135\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008136Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008137done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138
8139static PyObject *
8140unicode_ljust(PyUnicodeObject *self, PyObject *args)
8141{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008142 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008143 Py_UNICODE fillchar = ' ';
8144
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008145 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 return NULL;
8147
Tim Peters7a29bd52001-09-12 03:03:31 +00008148 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 Py_INCREF(self);
8150 return (PyObject*) self;
8151 }
8152
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008153 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008154}
8155
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008156PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008159Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008160
8161static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008162unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008164 return fixup(self, fixlower);
8165}
8166
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above.
   Both the strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
8175
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008176/* externally visible for str.strip(unicode) */
8177PyObject *
8178_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8179{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8181 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8182 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8183 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8184 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008185
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008187
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 i = 0;
8189 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8191 i++;
8192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008194
Benjamin Peterson14339b62009-01-31 16:36:08 +00008195 j = len;
8196 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 do {
8198 j--;
8199 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8200 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008201 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008202
Benjamin Peterson14339b62009-01-31 16:36:08 +00008203 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 Py_INCREF(self);
8205 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008206 }
8207 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008209}
8210
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211
8212static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008213do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008214{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008215 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8216 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008217
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 i = 0;
8219 if (striptype != RIGHTSTRIP) {
8220 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8221 i++;
8222 }
8223 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008224
Benjamin Peterson14339b62009-01-31 16:36:08 +00008225 j = len;
8226 if (striptype != LEFTSTRIP) {
8227 do {
8228 j--;
8229 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8230 j++;
8231 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008232
Benjamin Peterson14339b62009-01-31 16:36:08 +00008233 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8234 Py_INCREF(self);
8235 return (PyObject*)self;
8236 }
8237 else
8238 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008239}
8240
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008241
8242static PyObject *
8243do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8244{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008245 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008246
Benjamin Peterson14339b62009-01-31 16:36:08 +00008247 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8248 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008249
Benjamin Peterson14339b62009-01-31 16:36:08 +00008250 if (sep != NULL && sep != Py_None) {
8251 if (PyUnicode_Check(sep))
8252 return _PyUnicode_XStrip(self, striptype, sep);
8253 else {
8254 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 "%s arg must be None or str",
8256 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008257 return NULL;
8258 }
8259 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008260
Benjamin Peterson14339b62009-01-31 16:36:08 +00008261 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008262}
8263
8264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008265PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008267\n\
8268Return a copy of the string S with leading and trailing\n\
8269whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008270If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008271
8272static PyObject *
8273unicode_strip(PyUnicodeObject *self, PyObject *args)
8274{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008275 if (PyTuple_GET_SIZE(args) == 0)
8276 return do_strip(self, BOTHSTRIP); /* Common case */
8277 else
8278 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008279}
8280
8281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008282PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008284\n\
8285Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008286If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008287
8288static PyObject *
8289unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8290{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008291 if (PyTuple_GET_SIZE(args) == 0)
8292 return do_strip(self, LEFTSTRIP); /* Common case */
8293 else
8294 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008295}
8296
8297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008298PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008300\n\
8301Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008302If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008303
8304static PyObject *
8305unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8306{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008307 if (PyTuple_GET_SIZE(args) == 0)
8308 return do_strip(self, RIGHTSTRIP); /* Common case */
8309 else
8310 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008311}
8312
8313
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008315unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316{
8317 PyUnicodeObject *u;
8318 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008319 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008320 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321
Georg Brandl222de0f2009-04-12 12:01:50 +00008322 if (len < 1) {
8323 Py_INCREF(unicode_empty);
8324 return (PyObject *)unicode_empty;
8325 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326
Tim Peters7a29bd52001-09-12 03:03:31 +00008327 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 /* no repeat, return original string */
8329 Py_INCREF(str);
8330 return (PyObject*) str;
8331 }
Tim Peters8f422462000-09-09 06:13:41 +00008332
8333 /* ensure # of chars needed doesn't overflow int and # of bytes
8334 * needed doesn't overflow size_t
8335 */
8336 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008337 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008338 PyErr_SetString(PyExc_OverflowError,
8339 "repeated string is too long");
8340 return NULL;
8341 }
8342 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8343 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8344 PyErr_SetString(PyExc_OverflowError,
8345 "repeated string is too long");
8346 return NULL;
8347 }
8348 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008349 if (!u)
8350 return NULL;
8351
8352 p = u->str;
8353
Georg Brandl222de0f2009-04-12 12:01:50 +00008354 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008355 Py_UNICODE_FILL(p, str->str[0], len);
8356 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008357 Py_ssize_t done = str->length; /* number of characters copied this far */
8358 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008360 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008361 Py_UNICODE_COPY(p+done, p, n);
8362 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 }
8365
8366 return (PyObject*) u;
8367}
8368
Alexander Belopolsky40018472011-02-26 01:02:56 +00008369PyObject *
8370PyUnicode_Replace(PyObject *obj,
8371 PyObject *subobj,
8372 PyObject *replobj,
8373 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374{
8375 PyObject *self;
8376 PyObject *str1;
8377 PyObject *str2;
8378 PyObject *result;
8379
8380 self = PyUnicode_FromObject(obj);
8381 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 str1 = PyUnicode_FromObject(subobj);
8384 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 Py_DECREF(self);
8386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008387 }
8388 str2 = PyUnicode_FromObject(replobj);
8389 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 Py_DECREF(self);
8391 Py_DECREF(str1);
8392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008393 }
Tim Petersced69f82003-09-16 20:30:58 +00008394 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 (PyUnicodeObject *)str1,
8396 (PyUnicodeObject *)str2,
8397 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 Py_DECREF(self);
8399 Py_DECREF(str1);
8400 Py_DECREF(str2);
8401 return result;
8402}
8403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008404PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008405 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406\n\
8407Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008408old replaced by new. If the optional argument count is\n\
8409given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410
8411static PyObject*
8412unicode_replace(PyUnicodeObject *self, PyObject *args)
8413{
8414 PyUnicodeObject *str1;
8415 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008416 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417 PyObject *result;
8418
Martin v. Löwis18e16552006-02-15 17:27:45 +00008419 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 return NULL;
8421 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8422 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008425 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 Py_DECREF(str1);
8427 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429
8430 result = replace(self, str1, str2, maxcount);
8431
8432 Py_DECREF(str1);
8433 Py_DECREF(str2);
8434 return result;
8435}
8436
Alexander Belopolsky40018472011-02-26 01:02:56 +00008437static PyObject *
8438unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008439{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008440 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008441 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008442 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8443 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8444
8445 /* XXX(nnorwitz): rather than over-allocating, it would be
8446 better to choose a different scheme. Perhaps scan the
8447 first N-chars of the string and allocate based on that size.
8448 */
8449 /* Initial allocation is based on the longest-possible unichr
8450 escape.
8451
8452 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8453 unichr, so in this case it's the longest unichr escape. In
8454 narrow (UTF-16) builds this is five chars per source unichr
8455 since there are two unichrs in the surrogate pair, so in narrow
8456 (UTF-16) builds it's not the longest unichr escape.
8457
8458 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8459 so in the narrow (UTF-16) build case it's the longest unichr
8460 escape.
8461 */
8462
Walter Dörwald1ab83302007-05-18 17:15:44 +00008463 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008465#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008467#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008469#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008471 if (repr == NULL)
8472 return NULL;
8473
Walter Dörwald1ab83302007-05-18 17:15:44 +00008474 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008475
8476 /* Add quote */
8477 *p++ = (findchar(s, size, '\'') &&
8478 !findchar(s, size, '"')) ? '"' : '\'';
8479 while (size-- > 0) {
8480 Py_UNICODE ch = *s++;
8481
8482 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008483 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008484 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008485 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008486 continue;
8487 }
8488
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008490 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008491 *p++ = '\\';
8492 *p++ = 't';
8493 }
8494 else if (ch == '\n') {
8495 *p++ = '\\';
8496 *p++ = 'n';
8497 }
8498 else if (ch == '\r') {
8499 *p++ = '\\';
8500 *p++ = 'r';
8501 }
8502
8503 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008504 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008505 *p++ = '\\';
8506 *p++ = 'x';
8507 *p++ = hexdigits[(ch >> 4) & 0x000F];
8508 *p++ = hexdigits[ch & 0x000F];
8509 }
8510
Georg Brandl559e5d72008-06-11 18:37:52 +00008511 /* Copy ASCII characters as-is */
8512 else if (ch < 0x7F) {
8513 *p++ = ch;
8514 }
8515
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008517 else {
8518 Py_UCS4 ucs = ch;
8519
8520#ifndef Py_UNICODE_WIDE
8521 Py_UNICODE ch2 = 0;
8522 /* Get code point from surrogate pair */
8523 if (size > 0) {
8524 ch2 = *s;
8525 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008527 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008529 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008530 size--;
8531 }
8532 }
8533#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008534 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008535 (categories Z* and C* except ASCII space)
8536 */
8537 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8538 /* Map 8-bit characters to '\xhh' */
8539 if (ucs <= 0xff) {
8540 *p++ = '\\';
8541 *p++ = 'x';
8542 *p++ = hexdigits[(ch >> 4) & 0x000F];
8543 *p++ = hexdigits[ch & 0x000F];
8544 }
8545 /* Map 21-bit characters to '\U00xxxxxx' */
8546 else if (ucs >= 0x10000) {
8547 *p++ = '\\';
8548 *p++ = 'U';
8549 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8550 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8551 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8552 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8553 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8554 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8555 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8556 *p++ = hexdigits[ucs & 0x0000000F];
8557 }
8558 /* Map 16-bit characters to '\uxxxx' */
8559 else {
8560 *p++ = '\\';
8561 *p++ = 'u';
8562 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8563 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8564 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8565 *p++ = hexdigits[ucs & 0x000F];
8566 }
8567 }
8568 /* Copy characters as-is */
8569 else {
8570 *p++ = ch;
8571#ifndef Py_UNICODE_WIDE
8572 if (ucs >= 0x10000)
8573 *p++ = ch2;
8574#endif
8575 }
8576 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008577 }
8578 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008579 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008580
8581 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008582 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008583 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584}
8585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008586PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588\n\
8589Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008590such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591arguments start and end are interpreted as in slice notation.\n\
8592\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008593Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008594
8595static PyObject *
8596unicode_rfind(PyUnicodeObject *self, PyObject *args)
8597{
Jesus Ceaac451502011-04-20 17:09:23 +02008598 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008599 Py_ssize_t start;
8600 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008601 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008602
Jesus Ceaac451502011-04-20 17:09:23 +02008603 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8604 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008605 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606
Thomas Wouters477c8d52006-05-27 19:21:47 +00008607 result = stringlib_rfind_slice(
8608 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8609 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8610 start, end
8611 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612
8613 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008614
Christian Heimes217cfd12007-12-02 14:31:20 +00008615 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616}
8617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008618PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008620\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008621Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622
8623static PyObject *
8624unicode_rindex(PyUnicodeObject *self, PyObject *args)
8625{
Jesus Ceaac451502011-04-20 17:09:23 +02008626 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008627 Py_ssize_t start;
8628 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008629 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630
Jesus Ceaac451502011-04-20 17:09:23 +02008631 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8632 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008633 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634
Thomas Wouters477c8d52006-05-27 19:21:47 +00008635 result = stringlib_rfind_slice(
8636 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8637 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8638 start, end
8639 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008640
8641 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008642
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 if (result < 0) {
8644 PyErr_SetString(PyExc_ValueError, "substring not found");
8645 return NULL;
8646 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008647 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648}
8649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008650PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008653Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008654done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655
8656static PyObject *
8657unicode_rjust(PyUnicodeObject *self, PyObject *args)
8658{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008659 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008660 Py_UNICODE fillchar = ' ';
8661
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008662 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008663 return NULL;
8664
Tim Peters7a29bd52001-09-12 03:03:31 +00008665 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008666 Py_INCREF(self);
8667 return (PyObject*) self;
8668 }
8669
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008670 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671}
8672
Alexander Belopolsky40018472011-02-26 01:02:56 +00008673PyObject *
8674PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675{
8676 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008677
Guido van Rossumd57fd912000-03-10 22:53:23 +00008678 s = PyUnicode_FromObject(s);
8679 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008680 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 if (sep != NULL) {
8682 sep = PyUnicode_FromObject(sep);
8683 if (sep == NULL) {
8684 Py_DECREF(s);
8685 return NULL;
8686 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 }
8688
8689 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8690
8691 Py_DECREF(s);
8692 Py_XDECREF(sep);
8693 return result;
8694}
8695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008696PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698\n\
8699Return a list of the words in S, using sep as the\n\
8700delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008701splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008702whitespace string is a separator and empty strings are\n\
8703removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704
8705static PyObject*
8706unicode_split(PyUnicodeObject *self, PyObject *args)
8707{
8708 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008709 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008710
Martin v. Löwis18e16552006-02-15 17:27:45 +00008711 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 return NULL;
8713
8714 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008720}
8721
Thomas Wouters477c8d52006-05-27 19:21:47 +00008722PyObject *
8723PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8724{
8725 PyObject* str_obj;
8726 PyObject* sep_obj;
8727 PyObject* out;
8728
8729 str_obj = PyUnicode_FromObject(str_in);
8730 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008732 sep_obj = PyUnicode_FromObject(sep_in);
8733 if (!sep_obj) {
8734 Py_DECREF(str_obj);
8735 return NULL;
8736 }
8737
8738 out = stringlib_partition(
8739 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8740 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8741 );
8742
8743 Py_DECREF(sep_obj);
8744 Py_DECREF(str_obj);
8745
8746 return out;
8747}
8748
8749
8750PyObject *
8751PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8752{
8753 PyObject* str_obj;
8754 PyObject* sep_obj;
8755 PyObject* out;
8756
8757 str_obj = PyUnicode_FromObject(str_in);
8758 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008760 sep_obj = PyUnicode_FromObject(sep_in);
8761 if (!sep_obj) {
8762 Py_DECREF(str_obj);
8763 return NULL;
8764 }
8765
8766 out = stringlib_rpartition(
8767 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8768 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8769 );
8770
8771 Py_DECREF(sep_obj);
8772 Py_DECREF(str_obj);
8773
8774 return out;
8775}
8776
8777PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008779\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008780Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008781the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008782found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008783
8784static PyObject*
8785unicode_partition(PyUnicodeObject *self, PyObject *separator)
8786{
8787 return PyUnicode_Partition((PyObject *)self, separator);
8788}
8789
8790PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008791 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008792\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008793Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008794the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008795separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008796
8797static PyObject*
8798unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8799{
8800 return PyUnicode_RPartition((PyObject *)self, separator);
8801}
8802
Alexander Belopolsky40018472011-02-26 01:02:56 +00008803PyObject *
8804PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008805{
8806 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008807
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008808 s = PyUnicode_FromObject(s);
8809 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008810 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 if (sep != NULL) {
8812 sep = PyUnicode_FromObject(sep);
8813 if (sep == NULL) {
8814 Py_DECREF(s);
8815 return NULL;
8816 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008817 }
8818
8819 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8820
8821 Py_DECREF(s);
8822 Py_XDECREF(sep);
8823 return result;
8824}
8825
8826PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008828\n\
8829Return a list of the words in S, using sep as the\n\
8830delimiter string, starting at the end of the string and\n\
8831working to the front. If maxsplit is given, at most maxsplit\n\
8832splits are done. If sep is not specified, any whitespace string\n\
8833is a separator.");
8834
8835static PyObject*
8836unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8837{
8838 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008839 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008840
Martin v. Löwis18e16552006-02-15 17:27:45 +00008841 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008842 return NULL;
8843
8844 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008846 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008847 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008848 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008850}
8851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008852PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854\n\
8855Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008856Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008857is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858
8859static PyObject*
8860unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8861{
Guido van Rossum86662912000-04-11 15:38:46 +00008862 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008863
Guido van Rossum86662912000-04-11 15:38:46 +00008864 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 return NULL;
8866
Guido van Rossum86662912000-04-11 15:38:46 +00008867 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868}
8869
8870static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008871PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872{
Walter Dörwald346737f2007-05-31 10:44:43 +00008873 if (PyUnicode_CheckExact(self)) {
8874 Py_INCREF(self);
8875 return self;
8876 } else
8877 /* Subtype -- return genuine unicode string with the same value. */
8878 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8879 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880}
8881
PyDoc_STRVAR(swapcase__doc__,
    "S.swapcase() -> str\n\
\n\
Return a copy of S with uppercase characters converted to lowercase\n\
and vice versa.");

/* Method entry point for swapcase: delegates to fixup() with the
   fixswapcase callback and returns its result unchanged. */
static PyObject*
unicode_swapcase(PyUnicodeObject *self)
{
    return fixup(self, fixswapcase);
}
8893
Georg Brandlceee0772007-11-27 23:48:05 +00008894PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008896\n\
8897Return a translation table usable for str.translate().\n\
8898If there is only one argument, it must be a dictionary mapping Unicode\n\
8899ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008900Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008901If there are two arguments, they must be strings of equal length, and\n\
8902in the resulting dictionary, each character in x will be mapped to the\n\
8903character at the same position in y. If there is a third argument, it\n\
8904must be a string, whose characters will be mapped to None in the result.");
8905
8906static PyObject*
8907unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8908{
8909 PyObject *x, *y = NULL, *z = NULL;
8910 PyObject *new = NULL, *key, *value;
8911 Py_ssize_t i = 0;
8912 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008913
Georg Brandlceee0772007-11-27 23:48:05 +00008914 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8915 return NULL;
8916 new = PyDict_New();
8917 if (!new)
8918 return NULL;
8919 if (y != NULL) {
8920 /* x must be a string too, of equal length */
8921 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8922 if (!PyUnicode_Check(x)) {
8923 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8924 "be a string if there is a second argument");
8925 goto err;
8926 }
8927 if (PyUnicode_GET_SIZE(x) != ylen) {
8928 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8929 "arguments must have equal length");
8930 goto err;
8931 }
8932 /* create entries for translating chars in x to those in y */
8933 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008934 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8935 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008936 if (!key || !value)
8937 goto err;
8938 res = PyDict_SetItem(new, key, value);
8939 Py_DECREF(key);
8940 Py_DECREF(value);
8941 if (res < 0)
8942 goto err;
8943 }
8944 /* create entries for deleting chars in z */
8945 if (z != NULL) {
8946 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008947 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008948 if (!key)
8949 goto err;
8950 res = PyDict_SetItem(new, key, Py_None);
8951 Py_DECREF(key);
8952 if (res < 0)
8953 goto err;
8954 }
8955 }
8956 } else {
8957 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008958 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008959 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8960 "to maketrans it must be a dict");
8961 goto err;
8962 }
8963 /* copy entries into the new dict, converting string keys to int keys */
8964 while (PyDict_Next(x, &i, &key, &value)) {
8965 if (PyUnicode_Check(key)) {
8966 /* convert string keys to integer keys */
8967 PyObject *newkey;
8968 if (PyUnicode_GET_SIZE(key) != 1) {
8969 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8970 "table must be of length 1");
8971 goto err;
8972 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008973 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008974 if (!newkey)
8975 goto err;
8976 res = PyDict_SetItem(new, newkey, value);
8977 Py_DECREF(newkey);
8978 if (res < 0)
8979 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008980 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008981 /* just keep integer keys */
8982 if (PyDict_SetItem(new, key, value) < 0)
8983 goto err;
8984 } else {
8985 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8986 "be strings or integers");
8987 goto err;
8988 }
8989 }
8990 }
8991 return new;
8992 err:
8993 Py_DECREF(new);
8994 return NULL;
8995}
8996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008997PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999\n\
9000Return a copy of the string S, where all characters have been mapped\n\
9001through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009002Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00009003Unmapped characters are left untouched. Characters mapped to None\n\
9004are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005
9006static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009007unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008{
Georg Brandlceee0772007-11-27 23:48:05 +00009009 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010}
9011
PyDoc_STRVAR(upper__doc__,
    "S.upper() -> str\n\
\n\
Return a copy of S converted to uppercase.");

/* Method entry point for upper: delegates to fixup() with the fixupper
   callback and returns its result unchanged. */
static PyObject*
unicode_upper(PyUnicodeObject *self)
{
    return fixup(self, fixupper);
}
9022
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009023PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009024 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009026Pad a numeric string S with zeros on the left, to fill a field\n\
9027of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028
9029static PyObject *
9030unicode_zfill(PyUnicodeObject *self, PyObject *args)
9031{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009032 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033 PyUnicodeObject *u;
9034
Martin v. Löwis18e16552006-02-15 17:27:45 +00009035 Py_ssize_t width;
9036 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009037 return NULL;
9038
9039 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009040 if (PyUnicode_CheckExact(self)) {
9041 Py_INCREF(self);
9042 return (PyObject*) self;
9043 }
9044 else
9045 return PyUnicode_FromUnicode(
9046 PyUnicode_AS_UNICODE(self),
9047 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009048 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009049 }
9050
9051 fill = width - self->length;
9052
9053 u = pad(self, fill, 0, '0');
9054
Walter Dörwald068325e2002-04-15 13:36:47 +00009055 if (u == NULL)
9056 return NULL;
9057
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058 if (u->str[fill] == '+' || u->str[fill] == '-') {
9059 /* move sign to beginning of string */
9060 u->str[0] = u->str[fill];
9061 u->str[fill] = '0';
9062 }
9063
9064 return (PyObject*) u;
9065}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066
#if 0
/* Compiled-out helper: returns the current value of numfree as an int
   object. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyLong_FromLong(count);
}

/* Compiled-out helper: passes self's character buffer and length to
   PyUnicode_TransformDecimalToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif
9081
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009082PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009083 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009085Return True if S starts with the specified prefix, False otherwise.\n\
9086With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009087With optional end, stop comparing S at that position.\n\
9088prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089
9090static PyObject *
9091unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009094 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009095 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009096 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009097 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009098 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099
Jesus Ceaac451502011-04-20 17:09:23 +02009100 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009101 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009102 if (PyTuple_Check(subobj)) {
9103 Py_ssize_t i;
9104 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9105 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009106 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009107 if (substring == NULL)
9108 return NULL;
9109 result = tailmatch(self, substring, start, end, -1);
9110 Py_DECREF(substring);
9111 if (result) {
9112 Py_RETURN_TRUE;
9113 }
9114 }
9115 /* nothing matched */
9116 Py_RETURN_FALSE;
9117 }
9118 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009119 if (substring == NULL) {
9120 if (PyErr_ExceptionMatches(PyExc_TypeError))
9121 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9122 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009123 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009124 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009125 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009127 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009128}
9129
9130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009131PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009133\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009134Return True if S ends with the specified suffix, False otherwise.\n\
9135With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009136With optional end, stop comparing S at that position.\n\
9137suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138
9139static PyObject *
9140unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009143 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009145 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009146 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009147 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148
Jesus Ceaac451502011-04-20 17:09:23 +02009149 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009151 if (PyTuple_Check(subobj)) {
9152 Py_ssize_t i;
9153 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9154 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009156 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009158 result = tailmatch(self, substring, start, end, +1);
9159 Py_DECREF(substring);
9160 if (result) {
9161 Py_RETURN_TRUE;
9162 }
9163 }
9164 Py_RETURN_FALSE;
9165 }
9166 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +03009167 if (substring == NULL) {
9168 if (PyErr_ExceptionMatches(PyExc_TypeError))
9169 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9170 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00009171 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +03009172 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009173 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009175 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176}
9177
Eric Smith8c663262007-08-25 02:26:07 +00009178#include "stringlib/string_format.h"
9179
/* Doc string attached to the "format" entry of unicode_methods. */
PyDoc_STRVAR(format__doc__,
    "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Doc string attached to the "format_map" entry of unicode_methods. */
PyDoc_STRVAR(format_map__doc__,
    "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009191
Eric Smith4a7d76d2008-05-30 18:10:19 +00009192static PyObject *
9193unicode__format__(PyObject* self, PyObject* args)
9194{
9195 PyObject *format_spec;
9196
9197 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9198 return NULL;
9199
9200 return _PyUnicode_FormatAdvanced(self,
9201 PyUnicode_AS_UNICODE(format_spec),
9202 PyUnicode_GET_SIZE(format_spec));
9203}
9204
Eric Smith8c663262007-08-25 02:26:07 +00009205PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009206 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009207\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009208Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009209
9210static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009211unicode__sizeof__(PyUnicodeObject *v)
9212{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009213 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9214 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009215}
9216
9217PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009219
9220static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009221unicode_getnewargs(PyUnicodeObject *v)
9222{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009223 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009224}
9225
/* Method table for the unicode type.  Each entry gives the Python-visible
   name, the C implementation, the calling-convention flags and (where
   present) the doc string.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry flagged METH_STATIC. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No doc string is supplied for __getnewargs__. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
9290
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009291static PyObject *
9292unicode_mod(PyObject *v, PyObject *w)
9293{
Benjamin Peterson29060642009-01-31 22:14:21 +00009294 if (!PyUnicode_Check(v)) {
9295 Py_INCREF(Py_NotImplemented);
9296 return Py_NotImplemented;
9297 }
9298 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009299}
9300
/* Number protocol table: only nb_remainder is filled in (unicode_mod).
   The initializer stops after that slot, so every later member of this
   static struct is zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0, /*nb_add*/
    0, /*nb_subtract*/
    0, /*nb_multiply*/
    unicode_mod, /*nb_remainder*/
};
9307
/* Sequence protocol table: length, concatenation, repetition, item access
   and containment are provided; the slice and assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length, /* sq_length */
    PyUnicode_Concat, /* sq_concat */
    (ssizeargfunc) unicode_repeat, /* sq_repeat */
    (ssizeargfunc) unicode_getitem, /* sq_item */
    0, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    PyUnicode_Contains, /* sq_contains */
};
9318
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009319static PyObject*
9320unicode_subscript(PyUnicodeObject* self, PyObject* item)
9321{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009322 if (PyIndex_Check(item)) {
9323 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009324 if (i == -1 && PyErr_Occurred())
9325 return NULL;
9326 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009327 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009328 return unicode_getitem(self, i);
9329 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009330 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009331 Py_UNICODE* source_buf;
9332 Py_UNICODE* result_buf;
9333 PyObject* result;
9334
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009335 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009336 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009337 return NULL;
9338 }
9339
9340 if (slicelength <= 0) {
9341 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009342 } else if (start == 0 && step == 1 && slicelength == self->length &&
9343 PyUnicode_CheckExact(self)) {
9344 Py_INCREF(self);
9345 return (PyObject *)self;
9346 } else if (step == 1) {
9347 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009348 } else {
9349 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009350 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9351 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009352
Benjamin Peterson29060642009-01-31 22:14:21 +00009353 if (result_buf == NULL)
9354 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009355
9356 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9357 result_buf[i] = source_buf[cur];
9358 }
Tim Petersced69f82003-09-16 20:30:58 +00009359
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009360 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009361 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009362 return result;
9363 }
9364 } else {
9365 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9366 return NULL;
9367 }
9368}
9369
/* Mapping protocol table: length and subscript (index or slice) are
   provided; item assignment is not (mp_ass_subscript is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length, /* mp_length */
    (binaryfunc)unicode_subscript, /* mp_subscript */
    (objobjargproc)0, /* mp_ass_subscript */
};
9375
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376
Guido van Rossumd57fd912000-03-10 22:53:23 +00009377/* Helpers for PyUnicode_Format() */
9378
9379static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009380getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009382 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009384 (*p_argidx)++;
9385 if (arglen < 0)
9386 return args;
9387 else
9388 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389 }
9390 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 return NULL;
9393}
9394
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009395/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009397static PyObject *
9398formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009399{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009400 char *p;
9401 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009402 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009403
Guido van Rossumd57fd912000-03-10 22:53:23 +00009404 x = PyFloat_AsDouble(v);
9405 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009406 return NULL;
9407
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009409 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009410
Eric Smith0923d1d2009-04-16 20:16:10 +00009411 p = PyOS_double_to_string(x, type, prec,
9412 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009413 if (p == NULL)
9414 return NULL;
9415 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009416 PyMem_Free(p);
9417 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418}
9419
Tim Peters38fd5b62000-09-21 05:43:11 +00009420static PyObject*
9421formatlong(PyObject *val, int flags, int prec, int type)
9422{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009423 char *buf;
9424 int len;
9425 PyObject *str; /* temporary string object. */
9426 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009427
Benjamin Peterson14339b62009-01-31 16:36:08 +00009428 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9429 if (!str)
9430 return NULL;
9431 result = PyUnicode_FromStringAndSize(buf, len);
9432 Py_DECREF(str);
9433 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009434}
9435
Guido van Rossumd57fd912000-03-10 22:53:23 +00009436static int
9437formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009438 size_t buflen,
9439 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009441 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009442 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009443 if (PyUnicode_GET_SIZE(v) == 1) {
9444 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9445 buf[1] = '\0';
9446 return 1;
9447 }
9448#ifndef Py_UNICODE_WIDE
9449 if (PyUnicode_GET_SIZE(v) == 2) {
9450 /* Decode a valid surrogate pair */
9451 int c0 = PyUnicode_AS_UNICODE(v)[0];
9452 int c1 = PyUnicode_AS_UNICODE(v)[1];
9453 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9454 0xDC00 <= c1 && c1 <= 0xDFFF) {
9455 buf[0] = c0;
9456 buf[1] = c1;
9457 buf[2] = '\0';
9458 return 2;
9459 }
9460 }
9461#endif
9462 goto onError;
9463 }
9464 else {
9465 /* Integer input truncated to a character */
9466 long x;
9467 x = PyLong_AsLong(v);
9468 if (x == -1 && PyErr_Occurred())
9469 goto onError;
9470
9471 if (x < 0 || x > 0x10ffff) {
9472 PyErr_SetString(PyExc_OverflowError,
9473 "%c arg not in range(0x110000)");
9474 return -1;
9475 }
9476
9477#ifndef Py_UNICODE_WIDE
9478 if (x > 0xffff) {
9479 x -= 0x10000;
9480 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9481 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9482 return 2;
9483 }
9484#endif
9485 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009486 buf[1] = '\0';
9487 return 1;
9488 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009489
Benjamin Peterson29060642009-01-31 22:14:21 +00009490 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009491 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009492 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009493 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494}
9495
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009496/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009497 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009498*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009499#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009500
Alexander Belopolsky40018472011-02-26 01:02:56 +00009501PyObject *
9502PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503{
9504 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009505 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506 int args_owned = 0;
9507 PyUnicodeObject *result = NULL;
9508 PyObject *dict = NULL;
9509 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009510
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 PyErr_BadInternalCall();
9513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 }
9515 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009516 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 fmt = PyUnicode_AS_UNICODE(uformat);
9519 fmtcnt = PyUnicode_GET_SIZE(uformat);
9520
9521 reslen = rescnt = fmtcnt + 100;
9522 result = _PyUnicode_New(reslen);
9523 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009524 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525 res = PyUnicode_AS_UNICODE(result);
9526
9527 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009528 arglen = PyTuple_Size(args);
9529 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 }
9531 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 arglen = -1;
9533 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009535 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009536 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009538
9539 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 if (*fmt != '%') {
9541 if (--rescnt < 0) {
9542 rescnt = fmtcnt + 100;
9543 reslen += rescnt;
9544 if (_PyUnicode_Resize(&result, reslen) < 0)
9545 goto onError;
9546 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9547 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009548 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009550 }
9551 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009552 /* Got a format specifier */
9553 int flags = 0;
9554 Py_ssize_t width = -1;
9555 int prec = -1;
9556 Py_UNICODE c = '\0';
9557 Py_UNICODE fill;
9558 int isnumok;
9559 PyObject *v = NULL;
9560 PyObject *temp = NULL;
9561 Py_UNICODE *pbuf;
9562 Py_UNICODE sign;
9563 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009564 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 fmt++;
9567 if (*fmt == '(') {
9568 Py_UNICODE *keystart;
9569 Py_ssize_t keylen;
9570 PyObject *key;
9571 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009572
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 if (dict == NULL) {
9574 PyErr_SetString(PyExc_TypeError,
9575 "format requires a mapping");
9576 goto onError;
9577 }
9578 ++fmt;
9579 --fmtcnt;
9580 keystart = fmt;
9581 /* Skip over balanced parentheses */
9582 while (pcount > 0 && --fmtcnt >= 0) {
9583 if (*fmt == ')')
9584 --pcount;
9585 else if (*fmt == '(')
9586 ++pcount;
9587 fmt++;
9588 }
9589 keylen = fmt - keystart - 1;
9590 if (fmtcnt < 0 || pcount > 0) {
9591 PyErr_SetString(PyExc_ValueError,
9592 "incomplete format key");
9593 goto onError;
9594 }
9595#if 0
9596 /* keys are converted to strings using UTF-8 and
9597 then looked up since Python uses strings to hold
9598 variables names etc. in its namespaces and we
9599 wouldn't want to break common idioms. */
9600 key = PyUnicode_EncodeUTF8(keystart,
9601 keylen,
9602 NULL);
9603#else
9604 key = PyUnicode_FromUnicode(keystart, keylen);
9605#endif
9606 if (key == NULL)
9607 goto onError;
9608 if (args_owned) {
9609 Py_DECREF(args);
9610 args_owned = 0;
9611 }
9612 args = PyObject_GetItem(dict, key);
9613 Py_DECREF(key);
9614 if (args == NULL) {
9615 goto onError;
9616 }
9617 args_owned = 1;
9618 arglen = -1;
9619 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009620 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 while (--fmtcnt >= 0) {
9622 switch (c = *fmt++) {
9623 case '-': flags |= F_LJUST; continue;
9624 case '+': flags |= F_SIGN; continue;
9625 case ' ': flags |= F_BLANK; continue;
9626 case '#': flags |= F_ALT; continue;
9627 case '0': flags |= F_ZERO; continue;
9628 }
9629 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009630 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009631 if (c == '*') {
9632 v = getnextarg(args, arglen, &argidx);
9633 if (v == NULL)
9634 goto onError;
9635 if (!PyLong_Check(v)) {
9636 PyErr_SetString(PyExc_TypeError,
9637 "* wants int");
9638 goto onError;
9639 }
9640 width = PyLong_AsLong(v);
9641 if (width == -1 && PyErr_Occurred())
9642 goto onError;
9643 if (width < 0) {
9644 flags |= F_LJUST;
9645 width = -width;
9646 }
9647 if (--fmtcnt >= 0)
9648 c = *fmt++;
9649 }
9650 else if (c >= '0' && c <= '9') {
9651 width = c - '0';
9652 while (--fmtcnt >= 0) {
9653 c = *fmt++;
9654 if (c < '0' || c > '9')
9655 break;
9656 if ((width*10) / 10 != width) {
9657 PyErr_SetString(PyExc_ValueError,
9658 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009659 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009660 }
9661 width = width*10 + (c - '0');
9662 }
9663 }
9664 if (c == '.') {
9665 prec = 0;
9666 if (--fmtcnt >= 0)
9667 c = *fmt++;
9668 if (c == '*') {
9669 v = getnextarg(args, arglen, &argidx);
9670 if (v == NULL)
9671 goto onError;
9672 if (!PyLong_Check(v)) {
9673 PyErr_SetString(PyExc_TypeError,
9674 "* wants int");
9675 goto onError;
9676 }
9677 prec = PyLong_AsLong(v);
9678 if (prec == -1 && PyErr_Occurred())
9679 goto onError;
9680 if (prec < 0)
9681 prec = 0;
9682 if (--fmtcnt >= 0)
9683 c = *fmt++;
9684 }
9685 else if (c >= '0' && c <= '9') {
9686 prec = c - '0';
9687 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009688 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009689 if (c < '0' || c > '9')
9690 break;
9691 if ((prec*10) / 10 != prec) {
9692 PyErr_SetString(PyExc_ValueError,
9693 "prec too big");
9694 goto onError;
9695 }
9696 prec = prec*10 + (c - '0');
9697 }
9698 }
9699 } /* prec */
9700 if (fmtcnt >= 0) {
9701 if (c == 'h' || c == 'l' || c == 'L') {
9702 if (--fmtcnt >= 0)
9703 c = *fmt++;
9704 }
9705 }
9706 if (fmtcnt < 0) {
9707 PyErr_SetString(PyExc_ValueError,
9708 "incomplete format");
9709 goto onError;
9710 }
9711 if (c != '%') {
9712 v = getnextarg(args, arglen, &argidx);
9713 if (v == NULL)
9714 goto onError;
9715 }
9716 sign = 0;
9717 fill = ' ';
9718 switch (c) {
9719
9720 case '%':
9721 pbuf = formatbuf;
9722 /* presume that buffer length is at least 1 */
9723 pbuf[0] = '%';
9724 len = 1;
9725 break;
9726
9727 case 's':
9728 case 'r':
9729 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009730 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009731 temp = v;
9732 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009733 }
9734 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009735 if (c == 's')
9736 temp = PyObject_Str(v);
9737 else if (c == 'r')
9738 temp = PyObject_Repr(v);
9739 else
9740 temp = PyObject_ASCII(v);
9741 if (temp == NULL)
9742 goto onError;
9743 if (PyUnicode_Check(temp))
9744 /* nothing to do */;
9745 else {
9746 Py_DECREF(temp);
9747 PyErr_SetString(PyExc_TypeError,
9748 "%s argument has non-string str()");
9749 goto onError;
9750 }
9751 }
9752 pbuf = PyUnicode_AS_UNICODE(temp);
9753 len = PyUnicode_GET_SIZE(temp);
9754 if (prec >= 0 && len > prec)
9755 len = prec;
9756 break;
9757
9758 case 'i':
9759 case 'd':
9760 case 'u':
9761 case 'o':
9762 case 'x':
9763 case 'X':
9764 if (c == 'i')
9765 c = 'd';
9766 isnumok = 0;
9767 if (PyNumber_Check(v)) {
9768 PyObject *iobj=NULL;
9769
9770 if (PyLong_Check(v)) {
9771 iobj = v;
9772 Py_INCREF(iobj);
9773 }
9774 else {
9775 iobj = PyNumber_Long(v);
9776 }
9777 if (iobj!=NULL) {
9778 if (PyLong_Check(iobj)) {
9779 isnumok = 1;
9780 temp = formatlong(iobj, flags, prec, c);
9781 Py_DECREF(iobj);
9782 if (!temp)
9783 goto onError;
9784 pbuf = PyUnicode_AS_UNICODE(temp);
9785 len = PyUnicode_GET_SIZE(temp);
9786 sign = 1;
9787 }
9788 else {
9789 Py_DECREF(iobj);
9790 }
9791 }
9792 }
9793 if (!isnumok) {
9794 PyErr_Format(PyExc_TypeError,
9795 "%%%c format: a number is required, "
9796 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9797 goto onError;
9798 }
9799 if (flags & F_ZERO)
9800 fill = '0';
9801 break;
9802
9803 case 'e':
9804 case 'E':
9805 case 'f':
9806 case 'F':
9807 case 'g':
9808 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009809 temp = formatfloat(v, flags, prec, c);
9810 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009811 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009812 pbuf = PyUnicode_AS_UNICODE(temp);
9813 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009814 sign = 1;
9815 if (flags & F_ZERO)
9816 fill = '0';
9817 break;
9818
9819 case 'c':
9820 pbuf = formatbuf;
9821 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9822 if (len < 0)
9823 goto onError;
9824 break;
9825
9826 default:
9827 PyErr_Format(PyExc_ValueError,
9828 "unsupported format character '%c' (0x%x) "
9829 "at index %zd",
9830 (31<=c && c<=126) ? (char)c : '?',
9831 (int)c,
9832 (Py_ssize_t)(fmt - 1 -
9833 PyUnicode_AS_UNICODE(uformat)));
9834 goto onError;
9835 }
9836 if (sign) {
9837 if (*pbuf == '-' || *pbuf == '+') {
9838 sign = *pbuf++;
9839 len--;
9840 }
9841 else if (flags & F_SIGN)
9842 sign = '+';
9843 else if (flags & F_BLANK)
9844 sign = ' ';
9845 else
9846 sign = 0;
9847 }
9848 if (width < len)
9849 width = len;
9850 if (rescnt - (sign != 0) < width) {
9851 reslen -= rescnt;
9852 rescnt = width + fmtcnt + 100;
9853 reslen += rescnt;
9854 if (reslen < 0) {
9855 Py_XDECREF(temp);
9856 PyErr_NoMemory();
9857 goto onError;
9858 }
9859 if (_PyUnicode_Resize(&result, reslen) < 0) {
9860 Py_XDECREF(temp);
9861 goto onError;
9862 }
9863 res = PyUnicode_AS_UNICODE(result)
9864 + reslen - rescnt;
9865 }
9866 if (sign) {
9867 if (fill != ' ')
9868 *res++ = sign;
9869 rescnt--;
9870 if (width > len)
9871 width--;
9872 }
9873 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9874 assert(pbuf[0] == '0');
9875 assert(pbuf[1] == c);
9876 if (fill != ' ') {
9877 *res++ = *pbuf++;
9878 *res++ = *pbuf++;
9879 }
9880 rescnt -= 2;
9881 width -= 2;
9882 if (width < 0)
9883 width = 0;
9884 len -= 2;
9885 }
9886 if (width > len && !(flags & F_LJUST)) {
9887 do {
9888 --rescnt;
9889 *res++ = fill;
9890 } while (--width > len);
9891 }
9892 if (fill == ' ') {
9893 if (sign)
9894 *res++ = sign;
9895 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9896 assert(pbuf[0] == '0');
9897 assert(pbuf[1] == c);
9898 *res++ = *pbuf++;
9899 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009900 }
9901 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009902 Py_UNICODE_COPY(res, pbuf, len);
9903 res += len;
9904 rescnt -= len;
9905 while (--width >= len) {
9906 --rescnt;
9907 *res++ = ' ';
9908 }
9909 if (dict && (argidx < arglen) && c != '%') {
9910 PyErr_SetString(PyExc_TypeError,
9911 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009912 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 goto onError;
9914 }
9915 Py_XDECREF(temp);
9916 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917 } /* until end */
9918 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009919 PyErr_SetString(PyExc_TypeError,
9920 "not all arguments converted during string formatting");
9921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922 }
9923
Thomas Woutersa96affe2006-03-12 00:29:36 +00009924 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009925 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009927 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009928 }
9929 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930 return (PyObject *)result;
9931
Benjamin Peterson29060642009-01-31 22:14:21 +00009932 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933 Py_XDECREF(result);
9934 Py_DECREF(uformat);
9935 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009936 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937 }
9938 return NULL;
9939}
9940
Jeremy Hylton938ace62002-07-17 16:30:39 +00009941static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009942unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9943
Tim Peters6d6c1a32001-08-02 04:15:00 +00009944static PyObject *
9945unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9946{
Benjamin Peterson29060642009-01-31 22:14:21 +00009947 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009948 static char *kwlist[] = {"object", "encoding", "errors", 0};
9949 char *encoding = NULL;
9950 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009951
Benjamin Peterson14339b62009-01-31 16:36:08 +00009952 if (type != &PyUnicode_Type)
9953 return unicode_subtype_new(type, args, kwds);
9954 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009955 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009956 return NULL;
9957 if (x == NULL)
9958 return (PyObject *)_PyUnicode_New(0);
9959 if (encoding == NULL && errors == NULL)
9960 return PyObject_Str(x);
9961 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009962 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009963}
9964
Guido van Rossume023fe02001-08-30 03:12:59 +00009965static PyObject *
9966unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9967{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009968 PyUnicodeObject *tmp, *pnew;
9969 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009970
Benjamin Peterson14339b62009-01-31 16:36:08 +00009971 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9972 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9973 if (tmp == NULL)
9974 return NULL;
9975 assert(PyUnicode_Check(tmp));
9976 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9977 if (pnew == NULL) {
9978 Py_DECREF(tmp);
9979 return NULL;
9980 }
9981 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9982 if (pnew->str == NULL) {
9983 _Py_ForgetReference((PyObject *)pnew);
9984 PyObject_Del(pnew);
9985 Py_DECREF(tmp);
9986 return PyErr_NoMemory();
9987 }
9988 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9989 pnew->length = n;
9990 pnew->hash = tmp->hash;
9991 Py_DECREF(tmp);
9992 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009993}
9994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009995PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009996 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009997\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009998Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009999encoding defaults to the current default string encoding.\n\
10000errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000010001
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010002static PyObject *unicode_iter(PyObject *seq);
10003
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000010005 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000010006 "str", /* tp_name */
10007 sizeof(PyUnicodeObject), /* tp_size */
10008 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010010 (destructor)unicode_dealloc, /* tp_dealloc */
10011 0, /* tp_print */
10012 0, /* tp_getattr */
10013 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010014 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010015 unicode_repr, /* tp_repr */
10016 &unicode_as_number, /* tp_as_number */
10017 &unicode_as_sequence, /* tp_as_sequence */
10018 &unicode_as_mapping, /* tp_as_mapping */
10019 (hashfunc) unicode_hash, /* tp_hash*/
10020 0, /* tp_call*/
10021 (reprfunc) unicode_str, /* tp_str */
10022 PyObject_GenericGetAttr, /* tp_getattro */
10023 0, /* tp_setattro */
10024 0, /* tp_as_buffer */
10025 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010027 unicode_doc, /* tp_doc */
10028 0, /* tp_traverse */
10029 0, /* tp_clear */
10030 PyUnicode_RichCompare, /* tp_richcompare */
10031 0, /* tp_weaklistoffset */
10032 unicode_iter, /* tp_iter */
10033 0, /* tp_iternext */
10034 unicode_methods, /* tp_methods */
10035 0, /* tp_members */
10036 0, /* tp_getset */
10037 &PyBaseObject_Type, /* tp_base */
10038 0, /* tp_dict */
10039 0, /* tp_descr_get */
10040 0, /* tp_descr_set */
10041 0, /* tp_dictoffset */
10042 0, /* tp_init */
10043 0, /* tp_alloc */
10044 unicode_new, /* tp_new */
10045 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046};
10047
10048/* Initialize the Unicode implementation */
10049
Thomas Wouters78890102000-07-22 19:25:51 +000010050void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010052 int i;
10053
Thomas Wouters477c8d52006-05-27 19:21:47 +000010054 /* XXX - move this array to unicodectype.c ? */
10055 Py_UNICODE linebreak[] = {
10056 0x000A, /* LINE FEED */
10057 0x000D, /* CARRIAGE RETURN */
10058 0x001C, /* FILE SEPARATOR */
10059 0x001D, /* GROUP SEPARATOR */
10060 0x001E, /* RECORD SEPARATOR */
10061 0x0085, /* NEXT LINE */
10062 0x2028, /* LINE SEPARATOR */
10063 0x2029, /* PARAGRAPH SEPARATOR */
10064 };
10065
Fred Drakee4315f52000-05-09 19:53:39 +000010066 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010067 free_list = NULL;
10068 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010070 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010071 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010072
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010073 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010074 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010075 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010076 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010077
10078 /* initialize the linebreak bloom filter */
10079 bloom_linebreak = make_bloom_mask(
10080 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10081 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010082
10083 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084}
10085
10086/* Finalize the Unicode implementation */
10087
Christian Heimesa156e092008-02-16 07:38:31 +000010088int
10089PyUnicode_ClearFreeList(void)
10090{
10091 int freelist_size = numfree;
10092 PyUnicodeObject *u;
10093
10094 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010095 PyUnicodeObject *v = u;
10096 u = *(PyUnicodeObject **)u;
10097 if (v->str)
10098 PyObject_DEL(v->str);
10099 Py_XDECREF(v->defenc);
10100 PyObject_Del(v);
10101 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010102 }
10103 free_list = NULL;
10104 assert(numfree == 0);
10105 return freelist_size;
10106}
10107
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108void
Thomas Wouters78890102000-07-22 19:25:51 +000010109_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010111 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010113 Py_XDECREF(unicode_empty);
10114 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010115
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010116 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010117 if (unicode_latin1[i]) {
10118 Py_DECREF(unicode_latin1[i]);
10119 unicode_latin1[i] = NULL;
10120 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010121 }
Christian Heimesa156e092008-02-16 07:38:31 +000010122 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010123}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010124
Walter Dörwald16807132007-05-25 13:52:07 +000010125void
10126PyUnicode_InternInPlace(PyObject **p)
10127{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010128 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10129 PyObject *t;
10130 if (s == NULL || !PyUnicode_Check(s))
10131 Py_FatalError(
10132 "PyUnicode_InternInPlace: unicode strings only please!");
10133 /* If it's a subclass, we don't really know what putting
10134 it in the interned dict might do. */
10135 if (!PyUnicode_CheckExact(s))
10136 return;
10137 if (PyUnicode_CHECK_INTERNED(s))
10138 return;
10139 if (interned == NULL) {
10140 interned = PyDict_New();
10141 if (interned == NULL) {
10142 PyErr_Clear(); /* Don't leave an exception */
10143 return;
10144 }
10145 }
10146 /* It might be that the GetItem call fails even
10147 though the key is present in the dictionary,
10148 namely when this happens during a stack overflow. */
10149 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010150 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010151 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010152
Benjamin Peterson29060642009-01-31 22:14:21 +000010153 if (t) {
10154 Py_INCREF(t);
10155 Py_DECREF(*p);
10156 *p = t;
10157 return;
10158 }
Walter Dörwald16807132007-05-25 13:52:07 +000010159
Benjamin Peterson14339b62009-01-31 16:36:08 +000010160 PyThreadState_GET()->recursion_critical = 1;
10161 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10162 PyErr_Clear();
10163 PyThreadState_GET()->recursion_critical = 0;
10164 return;
10165 }
10166 PyThreadState_GET()->recursion_critical = 0;
10167 /* The two references in interned are not counted by refcnt.
10168 The deallocator will take care of this */
10169 Py_REFCNT(s) -= 2;
10170 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010171}
10172
10173void
10174PyUnicode_InternImmortal(PyObject **p)
10175{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010176 PyUnicode_InternInPlace(p);
10177 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10178 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10179 Py_INCREF(*p);
10180 }
Walter Dörwald16807132007-05-25 13:52:07 +000010181}
10182
10183PyObject *
10184PyUnicode_InternFromString(const char *cp)
10185{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010186 PyObject *s = PyUnicode_FromString(cp);
10187 if (s == NULL)
10188 return NULL;
10189 PyUnicode_InternInPlace(&s);
10190 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010191}
10192
Alexander Belopolsky40018472011-02-26 01:02:56 +000010193void
10194_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010195{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010196 PyObject *keys;
10197 PyUnicodeObject *s;
10198 Py_ssize_t i, n;
10199 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010200
Benjamin Peterson14339b62009-01-31 16:36:08 +000010201 if (interned == NULL || !PyDict_Check(interned))
10202 return;
10203 keys = PyDict_Keys(interned);
10204 if (keys == NULL || !PyList_Check(keys)) {
10205 PyErr_Clear();
10206 return;
10207 }
Walter Dörwald16807132007-05-25 13:52:07 +000010208
Benjamin Peterson14339b62009-01-31 16:36:08 +000010209 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10210 detector, interned unicode strings are not forcibly deallocated;
10211 rather, we give them their stolen references back, and then clear
10212 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010213
Benjamin Peterson14339b62009-01-31 16:36:08 +000010214 n = PyList_GET_SIZE(keys);
10215 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010216 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010217 for (i = 0; i < n; i++) {
10218 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10219 switch (s->state) {
10220 case SSTATE_NOT_INTERNED:
10221 /* XXX Shouldn't happen */
10222 break;
10223 case SSTATE_INTERNED_IMMORTAL:
10224 Py_REFCNT(s) += 1;
10225 immortal_size += s->length;
10226 break;
10227 case SSTATE_INTERNED_MORTAL:
10228 Py_REFCNT(s) += 2;
10229 mortal_size += s->length;
10230 break;
10231 default:
10232 Py_FatalError("Inconsistent interned string state.");
10233 }
10234 s->state = SSTATE_NOT_INTERNED;
10235 }
10236 fprintf(stderr, "total size of all interned strings: "
10237 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10238 "mortal/immortal\n", mortal_size, immortal_size);
10239 Py_DECREF(keys);
10240 PyDict_Clear(interned);
10241 Py_DECREF(interned);
10242 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010243}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010244
10245
10246/********************* Unicode Iterator **************************/
10247
10248typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010249 PyObject_HEAD
10250 Py_ssize_t it_index;
10251 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010252} unicodeiterobject;
10253
10254static void
10255unicodeiter_dealloc(unicodeiterobject *it)
10256{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010257 _PyObject_GC_UNTRACK(it);
10258 Py_XDECREF(it->it_seq);
10259 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010260}
10261
10262static int
10263unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10264{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010265 Py_VISIT(it->it_seq);
10266 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010267}
10268
10269static PyObject *
10270unicodeiter_next(unicodeiterobject *it)
10271{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010272 PyUnicodeObject *seq;
10273 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010274
Benjamin Peterson14339b62009-01-31 16:36:08 +000010275 assert(it != NULL);
10276 seq = it->it_seq;
10277 if (seq == NULL)
10278 return NULL;
10279 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010280
Benjamin Peterson14339b62009-01-31 16:36:08 +000010281 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10282 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010283 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010284 if (item != NULL)
10285 ++it->it_index;
10286 return item;
10287 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010288
Benjamin Peterson14339b62009-01-31 16:36:08 +000010289 Py_DECREF(seq);
10290 it->it_seq = NULL;
10291 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010292}
10293
10294static PyObject *
10295unicodeiter_len(unicodeiterobject *it)
10296{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010297 Py_ssize_t len = 0;
10298 if (it->it_seq)
10299 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10300 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010301}
10302
10303PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10304
10305static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010306 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010307 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010308 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010309};
10310
10311PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010312 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10313 "str_iterator", /* tp_name */
10314 sizeof(unicodeiterobject), /* tp_basicsize */
10315 0, /* tp_itemsize */
10316 /* methods */
10317 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10318 0, /* tp_print */
10319 0, /* tp_getattr */
10320 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010321 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010322 0, /* tp_repr */
10323 0, /* tp_as_number */
10324 0, /* tp_as_sequence */
10325 0, /* tp_as_mapping */
10326 0, /* tp_hash */
10327 0, /* tp_call */
10328 0, /* tp_str */
10329 PyObject_GenericGetAttr, /* tp_getattro */
10330 0, /* tp_setattro */
10331 0, /* tp_as_buffer */
10332 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10333 0, /* tp_doc */
10334 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10335 0, /* tp_clear */
10336 0, /* tp_richcompare */
10337 0, /* tp_weaklistoffset */
10338 PyObject_SelfIter, /* tp_iter */
10339 (iternextfunc)unicodeiter_next, /* tp_iternext */
10340 unicodeiter_methods, /* tp_methods */
10341 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010342};
10343
10344static PyObject *
10345unicode_iter(PyObject *seq)
10346{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010347 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010348
Benjamin Peterson14339b62009-01-31 16:36:08 +000010349 if (!PyUnicode_Check(seq)) {
10350 PyErr_BadInternalCall();
10351 return NULL;
10352 }
10353 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10354 if (it == NULL)
10355 return NULL;
10356 it->it_index = 0;
10357 Py_INCREF(seq);
10358 it->it_seq = (PyUnicodeObject *)seq;
10359 _PyObject_GC_TRACK(it);
10360 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010361}
10362
Martin v. Löwis5b222132007-06-10 09:51:05 +000010363size_t
10364Py_UNICODE_strlen(const Py_UNICODE *u)
10365{
10366 int res = 0;
10367 while(*u++)
10368 res++;
10369 return res;
10370}
10371
10372Py_UNICODE*
10373Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10374{
10375 Py_UNICODE *u = s1;
10376 while ((*u++ = *s2++));
10377 return s1;
10378}
10379
10380Py_UNICODE*
10381Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10382{
10383 Py_UNICODE *u = s1;
10384 while ((*u++ = *s2++))
10385 if (n-- == 0)
10386 break;
10387 return s1;
10388}
10389
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010390Py_UNICODE*
10391Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10392{
10393 Py_UNICODE *u1 = s1;
10394 u1 += Py_UNICODE_strlen(u1);
10395 Py_UNICODE_strcpy(u1, s2);
10396 return s1;
10397}
10398
Martin v. Löwis5b222132007-06-10 09:51:05 +000010399int
10400Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10401{
10402 while (*s1 && *s2 && *s1 == *s2)
10403 s1++, s2++;
10404 if (*s1 && *s2)
10405 return (*s1 < *s2) ? -1 : +1;
10406 if (*s1)
10407 return 1;
10408 if (*s2)
10409 return -1;
10410 return 0;
10411}
10412
Victor Stinneref8d95c2010-08-16 22:03:11 +000010413int
10414Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10415{
10416 register Py_UNICODE u1, u2;
10417 for (; n != 0; n--) {
10418 u1 = *s1;
10419 u2 = *s2;
10420 if (u1 != u2)
10421 return (u1 < u2) ? -1 : +1;
10422 if (u1 == '\0')
10423 return 0;
10424 s1++;
10425 s2++;
10426 }
10427 return 0;
10428}
10429
Martin v. Löwis5b222132007-06-10 09:51:05 +000010430Py_UNICODE*
10431Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10432{
10433 const Py_UNICODE *p;
10434 for (p = s; *p; p++)
10435 if (*p == c)
10436 return (Py_UNICODE*)p;
10437 return NULL;
10438}
10439
Victor Stinner331ea922010-08-10 16:37:20 +000010440Py_UNICODE*
10441Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10442{
10443 const Py_UNICODE *p;
10444 p = s + Py_UNICODE_strlen(s);
10445 while (p != s) {
10446 p--;
10447 if (*p == c)
10448 return (Py_UNICODE*)p;
10449 }
10450 return NULL;
10451}
10452
Victor Stinner71133ff2010-09-01 23:43:53 +000010453Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010454PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010455{
10456 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10457 Py_UNICODE *copy;
10458 Py_ssize_t size;
10459
10460 /* Ensure we won't overflow the size. */
10461 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10462 PyErr_NoMemory();
10463 return NULL;
10464 }
10465 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10466 size *= sizeof(Py_UNICODE);
10467 copy = PyMem_Malloc(size);
10468 if (copy == NULL) {
10469 PyErr_NoMemory();
10470 return NULL;
10471 }
10472 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10473 return copy;
10474}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010475
Georg Brandl66c221e2010-10-14 07:04:07 +000010476/* A _string module, to export formatter_parser and formatter_field_name_split
10477 to the string.Formatter class implemented in Python. */
10478
10479static PyMethodDef _string_methods[] = {
10480 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10481 METH_O, PyDoc_STR("split the argument as a field name")},
10482 {"formatter_parser", (PyCFunction) formatter_parser,
10483 METH_O, PyDoc_STR("parse the argument as a format string")},
10484 {NULL, NULL}
10485};
10486
10487static struct PyModuleDef _string_module = {
10488 PyModuleDef_HEAD_INIT,
10489 "_string",
10490 PyDoc_STR("string helper module"),
10491 0,
10492 _string_methods,
10493 NULL,
10494 NULL,
10495 NULL,
10496 NULL
10497};
10498
10499PyMODINIT_FUNC
10500PyInit__string(void)
10501{
10502 return PyModule_Create(&_string_module);
10503}
10504
10505
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010506#ifdef __cplusplus
10507}
10508#endif