blob: 7597a46ab6ae9709ceb2c48f4d5eb0533b58c1c6 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Thomas Wouters477c8d52006-05-27 19:21:47 +00007Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
Guido van Rossum16b1ad92000-08-03 16:24:25 +000010Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012--------------------------------------------------------------------
13The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Benjamin Peterson29060642009-01-31 22:14:21 +000015 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000017
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000018By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000041
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000042#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000043#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000044#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Maximum number of Unicode objects that may be parked on the free list */

#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list "stay alive" optimization.

   Objects placed on the free list keep their allocated Unicode buffer
   as long as their size is below this limit, which cuts malloc()
   overhead for small Unicode objects.

   The worst case cost is PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively disables the feature.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian is assumed unless the build
   configuration defines WORDS_BIGENDIAN */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
80
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
83 The globals are initialized by the _PyUnicode_Init() API and should
84 not be used before calling that API.
85
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Walter Dörwald16807132007-05-25 13:52:07 +000093/* This dictionary holds all interned unicode strings. Note that references
94 to strings in this dictionary are *not* counted in the string's ob_refcnt.
95 When the interned string reaches a refcnt of 0 the string deallocation
96 function will delete the reference from this dictionary.
97
98 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +000099 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000100*/
101static PyObject *interned;
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* Free list for Unicode objects */
Christian Heimes2202f872008-02-06 14:31:34 +0000104static PyUnicodeObject *free_list;
105static int numfree;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111 shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Fast detection of the most frequent whitespace characters: entry i is
   1 when ASCII character i is treated as whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
     *   0x0009 CHARACTER TABULATION
     *   0x000A LINE FEED
     *   0x000B LINE TABULATION
     *   0x000C FORM FEED
     *   0x000D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
     *   0x001C FILE SEPARATOR
     *   0x001D GROUP SEPARATOR
     *   0x001E RECORD SEPARATOR
     *   0x001F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
     *   0x0020 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
144
Alexander Belopolsky40018472011-02-26 01:02:56 +0000145static PyObject *
146unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000147 PyObject **errorHandler,const char *encoding, const char *reason,
148 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
149 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
150
Alexander Belopolsky40018472011-02-26 01:02:56 +0000151static void
152raise_encode_exception(PyObject **exceptionObject,
153 const char *encoding,
154 const Py_UNICODE *unicode, Py_ssize_t size,
155 Py_ssize_t startpos, Py_ssize_t endpos,
156 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000157
/* Fast detection of ASCII line break characters: entry i is 1 when
   ASCII character i is a line break, 0 otherwise.  The table is only
   ever read (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
185
186
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000187Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000188PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000189{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000190#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000191 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000192#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000193 /* This is actually an illegal character, so it should
194 not be passed to unichr. */
195 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000196#endif
197}
198
Thomas Wouters477c8d52006-05-27 19:21:47 +0000199/* --- Bloom Filters ----------------------------------------------------- */
200
201/* stuff to implement simple "bloom filters" for Unicode characters.
202 to keep things simple, we use a single bitmask, using the least 5
203 bits from each unicode characters as the bit index. */
204
205/* the linebreak mask is set up by Unicode_Init below */
206
Antoine Pitrouf068f942010-01-13 14:19:12 +0000207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
Thomas Wouters477c8d52006-05-27 19:21:47 +0000217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
Antoine Pitrouf068f942010-01-13 14:19:12 +0000221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000223
Benjamin Peterson29060642009-01-31 22:14:21 +0000224#define BLOOM_LINEBREAK(ch) \
225 ((ch) < 128U ? ascii_linebreak[(ch)] : \
226 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000227
Alexander Belopolsky40018472011-02-26 01:02:56 +0000228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000230{
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
Antoine Pitrouf068f942010-01-13 14:19:12 +0000233 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
Antoine Pitrouf2c54842010-01-13 08:07:53 +0000238 BLOOM_ADD(mask, ptr[i]);
Thomas Wouters477c8d52006-05-27 19:21:47 +0000239
240 return mask;
241}
242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000245{
246 Py_ssize_t i;
247
248 for (i = 0; i < setlen; i++)
249 if (set[i] == chr)
250 return 1;
251
252 return 0;
253}
254
/* True if `chr` is a member of `set` (of length `setlen`): the bloom mask
   is checked first so that unicode_member() is only called for possible
   hits.  The expansion is fully parenthesized so the macro can be
   negated or combined with other operators safely.  `chr` is evaluated
   more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                            \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258/* --- Unicode Object ----------------------------------------------------- */
259
Alexander Belopolsky40018472011-02-26 01:02:56 +0000260static int
261unicode_resize(register PyUnicodeObject *unicode,
262 Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263{
264 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000265
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000266 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 if (unicode->length == length)
Benjamin Peterson29060642009-01-31 22:14:21 +0000268 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000270 /* Resizing shared object (unicode_empty or single character
271 objects) in-place is not allowed. Use PyUnicode_Resize()
272 instead ! */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000273
Benjamin Peterson14339b62009-01-31 16:36:08 +0000274 if (unicode == unicode_empty ||
Benjamin Peterson29060642009-01-31 22:14:21 +0000275 (unicode->length == 1 &&
276 unicode->str[0] < 256U &&
277 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson142957c2008-07-04 19:55:29 +0000279 "can't resize shared str objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000280 return -1;
281 }
282
Thomas Wouters477c8d52006-05-27 19:21:47 +0000283 /* We allocate one more byte to make sure the string is Ux0000 terminated.
284 The overallocation is also used by fastsearch, which assumes that it's
285 safe to look at str[length] (without making any assumptions about what
286 it contains). */
287
Guido van Rossumd57fd912000-03-10 22:53:23 +0000288 oldstr = unicode->str;
Christian Heimesb186d002008-03-18 15:15:01 +0000289 unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson29060642009-01-31 22:14:21 +0000290 sizeof(Py_UNICODE) * (length + 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000292 unicode->str = (Py_UNICODE *)oldstr;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000293 PyErr_NoMemory();
294 return -1;
295 }
296 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000297 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000298
Benjamin Peterson29060642009-01-31 22:14:21 +0000299 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000300 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000301 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000302 Py_CLEAR(unicode->defenc);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000303 }
304 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000305
Guido van Rossumd57fd912000-03-10 22:53:23 +0000306 return 0;
307}
308
309/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000310 Ux0000 terminated; some code (e.g. new_identifier)
311 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000312
313 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000314 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000315
316*/
317
Alexander Belopolsky40018472011-02-26 01:02:56 +0000318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000320{
321 register PyUnicodeObject *unicode;
322
Thomas Wouters477c8d52006-05-27 19:21:47 +0000323 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000324 if (length == 0 && unicode_empty != NULL) {
325 Py_INCREF(unicode_empty);
326 return unicode_empty;
327 }
328
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000329 /* Ensure we won't overflow the size. */
330 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331 return (PyUnicodeObject *)PyErr_NoMemory();
332 }
333
Guido van Rossumd57fd912000-03-10 22:53:23 +0000334 /* Unicode freelist & memory allocation */
Christian Heimes2202f872008-02-06 14:31:34 +0000335 if (free_list) {
336 unicode = free_list;
337 free_list = *(PyUnicodeObject **)unicode;
338 numfree--;
Benjamin Peterson29060642009-01-31 22:14:21 +0000339 if (unicode->str) {
340 /* Keep-Alive optimization: we only upsize the buffer,
341 never downsize it. */
342 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000343 unicode_resize(unicode, length) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000344 PyObject_DEL(unicode->str);
345 unicode->str = NULL;
346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000347 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000349 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumad98db12001-06-14 17:52:02 +0000351 }
352 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000353 }
354 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000355 size_t new_size;
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000356 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000357 if (unicode == NULL)
358 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +0000359 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000361 }
362
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000363 if (!unicode->str) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000364 PyErr_NoMemory();
365 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000366 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000367 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000368 * the caller fails before initializing str -- unicode_resize()
369 * reads str[0], and the Keep-Alive optimization can keep memory
370 * allocated for str alive across a call to unicode_dealloc(unicode).
371 * We don't want unicode_resize to read uninitialized memory in
372 * that case.
373 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000374 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 unicode->str[length] = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 unicode->length = length;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000377 unicode->hash = -1;
Walter Dörwald16807132007-05-25 13:52:07 +0000378 unicode->state = 0;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000379 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000381
Benjamin Peterson29060642009-01-31 22:14:21 +0000382 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000383 /* XXX UNREF/NEWREF interface should be more symmetrical */
384 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000385 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000386 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000388}
389
Alexander Belopolsky40018472011-02-26 01:02:56 +0000390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392{
Walter Dörwald16807132007-05-25 13:52:07 +0000393 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000394 case SSTATE_NOT_INTERNED:
395 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000396
Benjamin Peterson29060642009-01-31 22:14:21 +0000397 case SSTATE_INTERNED_MORTAL:
398 /* revive dead object temporarily for DelItem */
399 Py_REFCNT(unicode) = 3;
400 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401 Py_FatalError(
402 "deletion of interned string failed");
403 break;
Walter Dörwald16807132007-05-25 13:52:07 +0000404
Benjamin Peterson29060642009-01-31 22:14:21 +0000405 case SSTATE_INTERNED_IMMORTAL:
406 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +0000407
Benjamin Peterson29060642009-01-31 22:14:21 +0000408 default:
409 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +0000410 }
411
Guido van Rossum604ddf82001-12-06 20:03:56 +0000412 if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000413 numfree < PyUnicode_MAXFREELIST) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000414 /* Keep-Alive optimization */
Benjamin Peterson29060642009-01-31 22:14:21 +0000415 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416 PyObject_DEL(unicode->str);
417 unicode->str = NULL;
418 unicode->length = 0;
419 }
420 if (unicode->defenc) {
Georg Brandl8ee604b2010-07-29 14:23:06 +0000421 Py_CLEAR(unicode->defenc);
Benjamin Peterson29060642009-01-31 22:14:21 +0000422 }
423 /* Add to free list */
Christian Heimes2202f872008-02-06 14:31:34 +0000424 *(PyUnicodeObject **)unicode = free_list;
425 free_list = unicode;
426 numfree++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 }
428 else {
Benjamin Peterson29060642009-01-31 22:14:21 +0000429 PyObject_DEL(unicode->str);
430 Py_XDECREF(unicode->defenc);
431 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 }
433}
434
Alexander Belopolsky40018472011-02-26 01:02:56 +0000435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000437{
438 register PyUnicodeObject *v;
439
440 /* Argument checks */
441 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000442 PyErr_BadInternalCall();
443 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000444 }
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000445 v = *unicode;
Christian Heimes90aa7642007-12-19 02:45:37 +0000446 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000447 PyErr_BadInternalCall();
448 return -1;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000449 }
450
451 /* Resizing unicode_empty and single character objects is not
452 possible since these are being shared. We simply return a fresh
453 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000454 if (v->length != length &&
Benjamin Peterson29060642009-01-31 22:14:21 +0000455 (v == unicode_empty || v->length == 1)) {
456 PyUnicodeObject *w = _PyUnicode_New(length);
457 if (w == NULL)
458 return -1;
459 Py_UNICODE_COPY(w->str, v->str,
460 length < v->length ? length : v->length);
461 Py_DECREF(*unicode);
462 *unicode = w;
463 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000464 }
465
466 /* Note that we don't have to modify *unicode for unshared Unicode
467 objects, since we can modify them in-place. */
468 return unicode_resize(v, length);
469}
470
Alexander Belopolsky40018472011-02-26 01:02:56 +0000471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +0000473{
474 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000476
Alexander Belopolsky40018472011-02-26 01:02:56 +0000477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000479{
480 PyUnicodeObject *unicode;
481
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000482 /* If the Unicode data is known at construction time, we can apply
483 some optimizations which share commonly used objects. */
484 if (u != NULL) {
485
Benjamin Peterson29060642009-01-31 22:14:21 +0000486 /* Optimization for empty strings */
487 if (size == 0 && unicode_empty != NULL) {
488 Py_INCREF(unicode_empty);
489 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000490 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000491
492 /* Single character Unicode objects in the Latin-1 range are
493 shared when using this constructor */
494 if (size == 1 && *u < 256) {
495 unicode = unicode_latin1[*u];
496 if (!unicode) {
497 unicode = _PyUnicode_New(1);
498 if (!unicode)
499 return NULL;
500 unicode->str[0] = *u;
501 unicode_latin1[*u] = unicode;
502 }
503 Py_INCREF(unicode);
504 return (PyObject *)unicode;
505 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode = _PyUnicode_New(size);
509 if (!unicode)
510 return NULL;
511
512 /* Copy the Unicode data into the new object */
513 if (u != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +0000514 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000515
516 return (PyObject *)unicode;
517}
518
Alexander Belopolsky40018472011-02-26 01:02:56 +0000519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000521{
522 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +0000523
Benjamin Peterson14339b62009-01-31 16:36:08 +0000524 if (size < 0) {
525 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +0000526 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +0000527 return NULL;
528 }
Christian Heimes33fe8092008-04-13 13:53:33 +0000529
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000530 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +0000531 some optimizations which share commonly used objects.
532 Also, this means the input must be UTF-8, so fall back to the
533 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000534 if (u != NULL) {
535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536 /* Optimization for empty strings */
537 if (size == 0 && unicode_empty != NULL) {
538 Py_INCREF(unicode_empty);
539 return (PyObject *)unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000540 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000541
542 /* Single characters are shared when using this constructor.
543 Restrict to ASCII, since the input must be UTF-8. */
544 if (size == 1 && Py_CHARMASK(*u) < 128) {
545 unicode = unicode_latin1[Py_CHARMASK(*u)];
546 if (!unicode) {
547 unicode = _PyUnicode_New(1);
548 if (!unicode)
549 return NULL;
550 unicode->str[0] = Py_CHARMASK(*u);
551 unicode_latin1[Py_CHARMASK(*u)] = unicode;
552 }
553 Py_INCREF(unicode);
554 return (PyObject *)unicode;
555 }
Martin v. Löwis9c121062007-08-05 20:26:11 +0000556
557 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000558 }
559
Walter Dörwald55507312007-05-18 13:12:10 +0000560 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000561 if (!unicode)
562 return NULL;
563
Walter Dörwaldacaa5a12007-05-05 12:00:46 +0000564 return (PyObject *)unicode;
565}
566
Alexander Belopolsky40018472011-02-26 01:02:56 +0000567PyObject *
568PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +0000569{
570 size_t size = strlen(u);
571 if (size > PY_SSIZE_T_MAX) {
572 PyErr_SetString(PyExc_OverflowError, "input too long");
573 return NULL;
574 }
575
576 return PyUnicode_FromStringAndSize(u, size);
577}
578
Guido van Rossumd57fd912000-03-10 22:53:23 +0000579#ifdef HAVE_WCHAR_H
580
Mark Dickinson081dfee2009-03-18 14:47:41 +0000581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588 to convert from UTF32 to UTF16. */
589
Alexander Belopolsky40018472011-02-26 01:02:56 +0000590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +0000592{
593 PyUnicodeObject *unicode;
594 register Py_ssize_t i;
595 Py_ssize_t alloc;
596 const wchar_t *orig_w;
597
598 if (w == NULL) {
599 if (size == 0)
600 return PyUnicode_FromStringAndSize(NULL, 0);
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 if (size == -1) {
606 size = wcslen(w);
607 }
608
609 alloc = size;
610 orig_w = w;
611 for (i = size; i > 0; i--) {
612 if (*w > 0xFFFF)
613 alloc++;
614 w++;
615 }
616 w = orig_w;
617 unicode = _PyUnicode_New(alloc);
618 if (!unicode)
619 return NULL;
620
621 /* Copy the wchar_t data into the new object */
622 {
623 register Py_UNICODE *u;
624 u = PyUnicode_AS_UNICODE(unicode);
625 for (i = size; i > 0; i--) {
626 if (*w > 0xFFFF) {
627 wchar_t ordinal = *w++;
628 ordinal -= 0x10000;
629 *u++ = 0xD800 | (ordinal >> 10);
630 *u++ = 0xDC00 | (ordinal & 0x3FF);
631 }
632 else
633 *u++ = *w++;
634 }
635 }
636 return (PyObject *)unicode;
637}
638
639#else
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 PyUnicodeObject *unicode;
645
646 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +0000647 if (size == 0)
648 return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +0000649 PyErr_BadInternalCall();
650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Martin v. Löwis790465f2008-04-05 20:41:37 +0000653 if (size == -1) {
654 size = wcslen(w);
655 }
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657 unicode = _PyUnicode_New(size);
658 if (!unicode)
659 return NULL;
660
661 /* Copy the wchar_t data into the new object */
Daniel Stutzbach8515eae2010-08-24 21:57:33 +0000662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Guido van Rossumd57fd912000-03-10 22:53:23 +0000663 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000664#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000665 {
Benjamin Peterson29060642009-01-31 22:14:21 +0000666 register Py_UNICODE *u;
667 register Py_ssize_t i;
668 u = PyUnicode_AS_UNICODE(unicode);
669 for (i = size; i > 0; i--)
670 *u++ = *w++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000671 }
672#endif
673
674 return (PyObject *)unicode;
675}
676
Mark Dickinson081dfee2009-03-18 14:47:41 +0000677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
Walter Dörwald346737f2007-05-31 10:44:43 +0000681static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +0000684{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000685 *fmt++ = '%';
686 if (width) {
687 if (zeropad)
688 *fmt++ = '0';
689 fmt += sprintf(fmt, "%d", width);
690 }
691 if (precision)
692 fmt += sprintf(fmt, ".%d", precision);
693 if (longflag)
694 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000695 else if (longlongflag) {
696 /* longlongflag should only ever be nonzero on machines with
697 HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699 char *f = PY_FORMAT_LONG_LONG;
700 while (*f)
701 *fmt++ = *f++;
702#else
703 /* we shouldn't ever get here */
704 assert(0);
705 *fmt++ = 'l';
706#endif
707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000708 else if (size_tflag) {
709 char *f = PY_FORMAT_SIZE_T;
710 while (*f)
711 *fmt++ = *f++;
712 }
713 *fmt++ = c;
714 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +0000715}
716
/* Helper for PyUnicode_FromFormatV(): parse the width, precision and length
   modifier of one conversion specification.

   f must point at the '%' that introduces the specification.  The return
   value points at the character the caller should treat as the conversion
   character (for example the "d" of "%5.3ld").

   Each p_* argument may be NULL when the caller is not interested in that
   value.  Otherwise:
     *p_width, *p_precision   parsed decimal values, 0 when absent; values
                              too large for an int saturate at INT_MAX
     *p_longflag              1 for "l" followed by 'd' or 'u'
     *p_longlongflag          1 for "ll" followed by 'd' or 'u' (only when
                              HAVE_LONG_LONG is defined)
     *p_size_tflag            1 for "z" followed by 'd' or 'u'
   A length modifier that is not followed by 'd' or 'u' is not consumed. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0, precision = 0;
    int longflag = 0, longlongflag = 0, size_tflag = 0;
    int digit;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    while (Py_ISDIGIT((unsigned)*f)) {
        digit = *f++ - '0';
        /* saturate instead of overflowing a signed int (undefined
           behaviour) on an absurdly long digit run */
        if (width > (INT_MAX - digit) / 10)
            width = INT_MAX;
        else
            width = width * 10 + digit;
    }
    if (*f == '.') {
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            digit = *f++ - '0';
            if (precision > (INT_MAX - digit) / 10)
                precision = INT_MAX;
            else
                precision = precision * 10 + digit;
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3", so the caller sees an unknown
               conversion instead of consuming the second '%' */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1", so the
           caller's loop still finds the terminating NUL afterwards */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
        size_tflag = 1;
        ++f;
    }
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
781
/* Append the NUL-terminated char string `string` to the output.

   The expansion expects two variables in the enclosing scope: `s`, the
   output cursor (left pointing just past the last element written), and
   `copy`, a scratch `const char *`.  No terminator is written.  The body is
   wrapped in do { } while (0) so that the macro is a single statement and
   stays correct inside an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy;) *s++ = *copy++; } while (0)

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793
Walter Dörwaldd2034312007-05-18 16:29:38 +0000794PyObject *
795PyUnicode_FromFormatV(const char *format, va_list vargs)
796{
Benjamin Peterson14339b62009-01-31 16:36:08 +0000797 va_list count;
798 Py_ssize_t callcount = 0;
799 PyObject **callresults = NULL;
800 PyObject **callresult = NULL;
801 Py_ssize_t n = 0;
802 int width = 0;
803 int precision = 0;
804 int zeropad;
805 const char* f;
806 Py_UNICODE *s;
807 PyObject *string;
808 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000809 char buffer[ITEM_BUFFER_LEN+1];
Benjamin Peterson14339b62009-01-31 16:36:08 +0000810 /* use abuffer instead of buffer, if we need more space
811 * (which can happen if there's a format specifier with width). */
812 char *abuffer = NULL;
813 char *realbuffer;
814 Py_ssize_t abuffersize = 0;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000815 char fmt[61]; /* should be enough for %0width.precisionlld */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000816 const char *copy;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000817
Victor Stinner4a2b7a12010-08-13 14:03:48 +0000818 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000819 /* step 1: count the number of %S/%R/%A/%s format specifications
820 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822 * result in an array) */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000823 for (f = format; *f; f++) {
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000824 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +0000825 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
826 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
827 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000828 ++callcount;
829 }
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000830 else if (128 <= (unsigned char)*f) {
831 PyErr_Format(PyExc_ValueError,
832 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
Victor Stinner4c7db312010-09-12 07:51:18 +0000833 "string, got a non-ASCII byte: 0x%02x",
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000834 (unsigned char)*f);
Benjamin Petersond4ac96a2010-09-12 16:40:53 +0000835 return NULL;
Benjamin Peterson9be0b2e2010-09-12 03:40:54 +0000836 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000837 }
838 /* step 2: allocate memory for the results of
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000839 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000840 if (callcount) {
841 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842 if (!callresults) {
843 PyErr_NoMemory();
844 return NULL;
845 }
846 callresult = callresults;
847 }
848 /* step 3: figure out how large a buffer we need */
849 for (f = format; *f; f++) {
850 if (*f == '%') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000851#ifdef HAVE_LONG_LONG
Victor Stinner96865452011-03-01 23:44:09 +0000852 int longlongflag;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000853#endif
Victor Stinner96865452011-03-01 23:44:09 +0000854 const char* p;
Walter Dörwaldd2034312007-05-18 16:29:38 +0000855
Victor Stinner96865452011-03-01 23:44:09 +0000856 p = f;
857 f = parse_format_flags(f, &width, NULL,
858 NULL, &longlongflag, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +0000859
Benjamin Peterson14339b62009-01-31 16:36:08 +0000860 switch (*f) {
861 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000862 {
863#ifndef Py_UNICODE_WIDE
864 int ordinal = va_arg(count, int);
865 if (ordinal > 0xffff)
866 n += 2;
867 else
868 n++;
869#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000870 (void)va_arg(count, int);
Victor Stinner5ed8b2c2011-02-21 21:13:44 +0000871 n++;
872#endif
873 break;
874 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000875 case '%':
876 n++;
877 break;
878 case 'd': case 'u': case 'i': case 'x':
879 (void) va_arg(count, int);
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000880#ifdef HAVE_LONG_LONG
881 if (longlongflag) {
882 if (width < MAX_LONG_LONG_CHARS)
883 width = MAX_LONG_LONG_CHARS;
884 }
885 else
886#endif
887 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888 including sign. Decimal takes the most space. This
889 isn't enough for octal. If a width is specified we
890 need more (which we allocate later). */
891 if (width < MAX_LONG_CHARS)
892 width = MAX_LONG_CHARS;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000893 n += width;
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000894 /* XXX should allow for large precision here too. */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000895 if (abuffersize < width)
896 abuffersize = width;
897 break;
898 case 's':
899 {
900 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +0000901 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +0000902 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903 if (!str)
904 goto fail;
905 n += PyUnicode_GET_SIZE(str);
906 /* Remember the str and switch to the next slot */
907 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000908 break;
909 }
910 case 'U':
911 {
912 PyObject *obj = va_arg(count, PyObject *);
913 assert(obj && PyUnicode_Check(obj));
914 n += PyUnicode_GET_SIZE(obj);
915 break;
916 }
917 case 'V':
918 {
919 PyObject *obj = va_arg(count, PyObject *);
920 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000921 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +0000922 assert(obj || str);
923 assert(!obj || PyUnicode_Check(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +0000924 if (obj) {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000925 n += PyUnicode_GET_SIZE(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +0000926 *callresult++ = NULL;
927 }
928 else {
929 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930 if (!str_obj)
931 goto fail;
932 n += PyUnicode_GET_SIZE(str_obj);
933 *callresult++ = str_obj;
934 }
Benjamin Peterson14339b62009-01-31 16:36:08 +0000935 break;
936 }
937 case 'S':
938 {
939 PyObject *obj = va_arg(count, PyObject *);
940 PyObject *str;
941 assert(obj);
942 str = PyObject_Str(obj);
943 if (!str)
944 goto fail;
945 n += PyUnicode_GET_SIZE(str);
946 /* Remember the str and switch to the next slot */
947 *callresult++ = str;
948 break;
949 }
950 case 'R':
951 {
952 PyObject *obj = va_arg(count, PyObject *);
953 PyObject *repr;
954 assert(obj);
955 repr = PyObject_Repr(obj);
956 if (!repr)
957 goto fail;
958 n += PyUnicode_GET_SIZE(repr);
959 /* Remember the repr and switch to the next slot */
960 *callresult++ = repr;
961 break;
962 }
963 case 'A':
964 {
965 PyObject *obj = va_arg(count, PyObject *);
966 PyObject *ascii;
967 assert(obj);
968 ascii = PyObject_ASCII(obj);
969 if (!ascii)
970 goto fail;
971 n += PyUnicode_GET_SIZE(ascii);
972 /* Remember the repr and switch to the next slot */
973 *callresult++ = ascii;
974 break;
975 }
976 case 'p':
977 (void) va_arg(count, int);
978 /* maximum 64-bit pointer representation:
979 * 0xffffffffffffffff
980 * so 19 characters is enough.
981 * XXX I count 18 -- what's the extra for?
982 */
983 n += 19;
984 break;
985 default:
986 /* if we stumble upon an unknown
987 formatting code, copy the rest of
988 the format string to the output
989 string. (we cannot just skip the
990 code, since there's no way to know
991 what's in the argument list) */
992 n += strlen(p);
993 goto expand;
994 }
995 } else
996 n++;
997 }
Benjamin Peterson29060642009-01-31 22:14:21 +0000998 expand:
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +0000999 if (abuffersize > ITEM_BUFFER_LEN) {
1000 /* add 1 for sprintf's trailing null byte */
1001 abuffer = PyObject_Malloc(abuffersize + 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001002 if (!abuffer) {
1003 PyErr_NoMemory();
1004 goto fail;
1005 }
1006 realbuffer = abuffer;
1007 }
1008 else
1009 realbuffer = buffer;
1010 /* step 4: fill the buffer */
1011 /* Since we've analyzed how much space we need for the worst case,
1012 we don't have to resize the string.
1013 There can be no errors beyond this point. */
1014 string = PyUnicode_FromUnicode(NULL, n);
1015 if (!string)
1016 goto fail;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001017
Benjamin Peterson14339b62009-01-31 16:36:08 +00001018 s = PyUnicode_AS_UNICODE(string);
1019 callresult = callresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001020
Benjamin Peterson14339b62009-01-31 16:36:08 +00001021 for (f = format; *f; f++) {
1022 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001023 const char* p;
1024 int longflag;
1025 int longlongflag;
1026 int size_tflag;
1027
1028 p = f;
1029 zeropad = (f[1] == '0');
1030 f = parse_format_flags(f, &width, &precision,
1031 &longflag, &longlongflag, &size_tflag);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001032
Benjamin Peterson14339b62009-01-31 16:36:08 +00001033 switch (*f) {
1034 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001035 {
1036 int ordinal = va_arg(vargs, int);
1037#ifndef Py_UNICODE_WIDE
1038 if (ordinal > 0xffff) {
1039 ordinal -= 0x10000;
1040 *s++ = 0xD800 | (ordinal >> 10);
1041 *s++ = 0xDC00 | (ordinal & 0x3FF);
1042 } else
1043#endif
1044 *s++ = ordinal;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001045 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00001046 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001047 case 'd':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001048 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1049 width, precision, 'd');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001050 if (longflag)
1051 sprintf(realbuffer, fmt, va_arg(vargs, long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001052#ifdef HAVE_LONG_LONG
1053 else if (longlongflag)
1054 sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1055#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001056 else if (size_tflag)
1057 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1058 else
1059 sprintf(realbuffer, fmt, va_arg(vargs, int));
1060 appendstring(realbuffer);
1061 break;
1062 case 'u':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001063 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1064 width, precision, 'u');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001065 if (longflag)
1066 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001067#ifdef HAVE_LONG_LONG
1068 else if (longlongflag)
1069 sprintf(realbuffer, fmt, va_arg(vargs,
1070 unsigned PY_LONG_LONG));
1071#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001072 else if (size_tflag)
1073 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1074 else
1075 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1076 appendstring(realbuffer);
1077 break;
1078 case 'i':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001079 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001080 sprintf(realbuffer, fmt, va_arg(vargs, int));
1081 appendstring(realbuffer);
1082 break;
1083 case 'x':
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001084 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
Benjamin Peterson14339b62009-01-31 16:36:08 +00001085 sprintf(realbuffer, fmt, va_arg(vargs, int));
1086 appendstring(realbuffer);
1087 break;
1088 case 's':
1089 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001090 /* unused, since we already have the result */
1091 (void) va_arg(vargs, char *);
1092 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1093 PyUnicode_GET_SIZE(*callresult));
1094 s += PyUnicode_GET_SIZE(*callresult);
1095 /* We're done with the unicode()/repr() => forget it */
1096 Py_DECREF(*callresult);
1097 /* switch to next unicode()/repr() result */
1098 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001099 break;
1100 }
1101 case 'U':
1102 {
1103 PyObject *obj = va_arg(vargs, PyObject *);
1104 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1105 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1106 s += size;
1107 break;
1108 }
1109 case 'V':
1110 {
1111 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00001112 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001113 if (obj) {
1114 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1115 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1116 s += size;
1117 } else {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001118 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1119 PyUnicode_GET_SIZE(*callresult));
1120 s += PyUnicode_GET_SIZE(*callresult);
1121 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001122 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00001123 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001124 break;
1125 }
1126 case 'S':
1127 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00001128 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00001129 {
1130 Py_UNICODE *ucopy;
1131 Py_ssize_t usize;
1132 Py_ssize_t upos;
1133 /* unused, since we already have the result */
1134 (void) va_arg(vargs, PyObject *);
1135 ucopy = PyUnicode_AS_UNICODE(*callresult);
1136 usize = PyUnicode_GET_SIZE(*callresult);
1137 for (upos = 0; upos<usize;)
1138 *s++ = ucopy[upos++];
1139 /* We're done with the unicode()/repr() => forget it */
1140 Py_DECREF(*callresult);
1141 /* switch to next unicode()/repr() result */
1142 ++callresult;
1143 break;
1144 }
1145 case 'p':
1146 sprintf(buffer, "%p", va_arg(vargs, void*));
1147 /* %p is ill-defined: ensure leading 0x. */
1148 if (buffer[1] == 'X')
1149 buffer[1] = 'x';
1150 else if (buffer[1] != 'x') {
1151 memmove(buffer+2, buffer, strlen(buffer)+1);
1152 buffer[0] = '0';
1153 buffer[1] = 'x';
1154 }
1155 appendstring(buffer);
1156 break;
1157 case '%':
1158 *s++ = '%';
1159 break;
1160 default:
1161 appendstring(p);
1162 goto end;
1163 }
Victor Stinner1205f272010-09-11 00:54:47 +00001164 }
Victor Stinner1205f272010-09-11 00:54:47 +00001165 else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001166 *s++ = *f;
1167 }
Walter Dörwaldd2034312007-05-18 16:29:38 +00001168
Benjamin Peterson29060642009-01-31 22:14:21 +00001169 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001170 if (callresults)
1171 PyObject_Free(callresults);
1172 if (abuffer)
1173 PyObject_Free(abuffer);
1174 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1175 return string;
Benjamin Peterson29060642009-01-31 22:14:21 +00001176 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00001177 if (callresults) {
1178 PyObject **callresult2 = callresults;
1179 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00001180 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00001181 ++callresult2;
1182 }
1183 PyObject_Free(callresults);
1184 }
1185 if (abuffer)
1186 PyObject_Free(abuffer);
1187 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001188}
1189
1190#undef appendstring
1191
1192PyObject *
1193PyUnicode_FromFormat(const char *format, ...)
1194{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001195 PyObject* ret;
1196 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001197
1198#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00001199 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001200#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00001201 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001202#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00001203 ret = PyUnicode_FromFormatV(format, vargs);
1204 va_end(vargs);
1205 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001206}
1207
Victor Stinner5593d8a2010-10-02 11:11:27 +00001208/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1209 convert a Unicode object to a wide character string.
1210
1211 - If w is NULL: return the number of wide characters (including the nul
1212 character) required to convert the unicode object. Ignore size argument.
1213
1214 - Otherwise: return the number of wide characters (excluding the nul
1215 character) written into w. Write at most size wide characters (including
1216 the nul character). */
1217static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00001218unicode_aswidechar(PyUnicodeObject *unicode,
1219 wchar_t *w,
1220 Py_ssize_t size)
1221{
1222#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
Victor Stinner5593d8a2010-10-02 11:11:27 +00001223 Py_ssize_t res;
1224 if (w != NULL) {
1225 res = PyUnicode_GET_SIZE(unicode);
1226 if (size > res)
1227 size = res + 1;
1228 else
1229 res = size;
1230 memcpy(w, unicode->str, size * sizeof(wchar_t));
1231 return res;
1232 }
1233 else
1234 return PyUnicode_GET_SIZE(unicode) + 1;
1235#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1236 register const Py_UNICODE *u;
1237 const Py_UNICODE *uend;
1238 const wchar_t *worig, *wend;
1239 Py_ssize_t nchar;
1240
Victor Stinner137c34c2010-09-29 10:25:54 +00001241 u = PyUnicode_AS_UNICODE(unicode);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001242 uend = u + PyUnicode_GET_SIZE(unicode);
1243 if (w != NULL) {
1244 worig = w;
1245 wend = w + size;
1246 while (u != uend && w != wend) {
1247 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1248 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1249 {
1250 *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1251 u += 2;
1252 }
1253 else {
1254 *w = *u;
1255 u++;
1256 }
1257 w++;
1258 }
1259 if (w != wend)
1260 *w = L'\0';
1261 return w - worig;
1262 }
1263 else {
1264 nchar = 1; /* nul character at the end */
1265 while (u != uend) {
1266 if (0xD800 <= u[0] && u[0] <= 0xDBFF
1267 && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1268 u += 2;
1269 else
1270 u++;
1271 nchar++;
1272 }
1273 }
1274 return nchar;
1275#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1276 register Py_UNICODE *u, *uend, ordinal;
1277 register Py_ssize_t i;
1278 wchar_t *worig, *wend;
1279 Py_ssize_t nchar;
1280
1281 u = PyUnicode_AS_UNICODE(unicode);
1282 uend = u + PyUnicode_GET_SIZE(u);
1283 if (w != NULL) {
1284 worig = w;
1285 wend = w + size;
1286 while (u != uend && w != wend) {
1287 ordinal = *u;
1288 if (ordinal > 0xffff) {
1289 ordinal -= 0x10000;
1290 *w++ = 0xD800 | (ordinal >> 10);
1291 *w++ = 0xDC00 | (ordinal & 0x3FF);
1292 }
1293 else
1294 *w++ = ordinal;
1295 u++;
1296 }
1297 if (w != wend)
1298 *w = 0;
1299 return w - worig;
1300 }
1301 else {
1302 nchar = 1; /* nul character */
1303 while (u != uend) {
1304 if (*u > 0xffff)
1305 nchar += 2;
1306 else
1307 nchar++;
1308 u++;
1309 }
1310 return nchar;
1311 }
1312#else
1313# error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
Victor Stinner137c34c2010-09-29 10:25:54 +00001314#endif
1315}
1316
1317Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001318PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001319 wchar_t *w,
1320 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321{
1322 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001323 PyErr_BadInternalCall();
1324 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00001326 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327}
1328
Victor Stinner137c34c2010-09-29 10:25:54 +00001329wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001330PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00001331 Py_ssize_t *size)
1332{
1333 wchar_t* buffer;
1334 Py_ssize_t buflen;
1335
1336 if (unicode == NULL) {
1337 PyErr_BadInternalCall();
1338 return NULL;
1339 }
1340
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001341 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001342 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00001343 PyErr_NoMemory();
1344 return NULL;
1345 }
1346
Victor Stinner137c34c2010-09-29 10:25:54 +00001347 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1348 if (buffer == NULL) {
1349 PyErr_NoMemory();
1350 return NULL;
1351 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00001352 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Victor Stinner5593d8a2010-10-02 11:11:27 +00001353 if (size != NULL)
1354 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00001355 return buffer;
1356}
1357
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358#endif
1359
Alexander Belopolsky40018472011-02-26 01:02:56 +00001360PyObject *
1361PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001362{
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001363 Py_UNICODE s[2];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001364
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001365 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001366 PyErr_SetString(PyExc_ValueError,
1367 "chr() arg not in range(0x110000)");
1368 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001369 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00001370
1371#ifndef Py_UNICODE_WIDE
1372 if (ordinal > 0xffff) {
1373 ordinal -= 0x10000;
1374 s[0] = 0xD800 | (ordinal >> 10);
1375 s[1] = 0xDC00 | (ordinal & 0x3FF);
1376 return PyUnicode_FromUnicode(s, 2);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001377 }
1378#endif
1379
Hye-Shik Chang40574832004-04-06 07:24:51 +00001380 s[0] = (Py_UNICODE)ordinal;
1381 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00001382}
1383
Alexander Belopolsky40018472011-02-26 01:02:56 +00001384PyObject *
1385PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001387 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00001388 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001389 if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001390 Py_INCREF(obj);
1391 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001392 }
1393 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001394 /* For a Unicode subtype that's not a Unicode object,
1395 return a true Unicode object with the same data. */
1396 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1397 PyUnicode_GET_SIZE(obj));
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001398 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00001399 PyErr_Format(PyExc_TypeError,
1400 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00001401 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001402 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001403}
1404
Alexander Belopolsky40018472011-02-26 01:02:56 +00001405PyObject *
1406PyUnicode_FromEncodedObject(register PyObject *obj,
1407 const char *encoding,
1408 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001409{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001410 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001411 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00001412
Guido van Rossumd57fd912000-03-10 22:53:23 +00001413 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001414 PyErr_BadInternalCall();
1415 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001417
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001418 /* Decoding bytes objects is the most common case and should be fast */
1419 if (PyBytes_Check(obj)) {
1420 if (PyBytes_GET_SIZE(obj) == 0) {
1421 Py_INCREF(unicode_empty);
1422 v = (PyObject *) unicode_empty;
1423 }
1424 else {
1425 v = PyUnicode_Decode(
1426 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1427 encoding, errors);
1428 }
1429 return v;
1430 }
1431
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001432 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001433 PyErr_SetString(PyExc_TypeError,
1434 "decoding str is not supported");
1435 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001436 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00001437
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001438 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1439 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1440 PyErr_Format(PyExc_TypeError,
1441 "coercing to str: need bytes, bytearray "
1442 "or buffer-like object, %.80s found",
1443 Py_TYPE(obj)->tp_name);
1444 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00001445 }
Tim Petersced69f82003-09-16 20:30:58 +00001446
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001447 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001448 Py_INCREF(unicode_empty);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001449 v = (PyObject *) unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 }
Tim Petersced69f82003-09-16 20:30:58 +00001451 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001452 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00001453
Antoine Pitroub0fa8312010-09-01 15:10:12 +00001454 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00001455 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456}
1457
/* Copy encoding into lower, converting upper case letters to lower case and
   replacing '_' with '-', in order to catch e.g. UTF_8.

   lower has room for lower_len chars including the terminating NUL.
   Return 1 on success.  Return 0 if encoding is longer than lower_len-1
   characters; in that case lower is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of lower, reserved for the terminating NUL */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1490
Alexander Belopolsky40018472011-02-26 01:02:56 +00001491PyObject *
1492PyUnicode_Decode(const char *s,
1493 Py_ssize_t size,
1494 const char *encoding,
1495 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00001496{
1497 PyObject *buffer = NULL, *unicode;
1498 Py_buffer info;
1499 char lower[11]; /* Enough for any encoding shortcut */
1500
1501 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001502 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001503
1504 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001505 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001506 if ((strcmp(lower, "utf-8") == 0) ||
1507 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001508 return PyUnicode_DecodeUTF8(s, size, errors);
1509 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001510 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001511 (strcmp(lower, "iso-8859-1") == 0))
1512 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001513#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001514 else if (strcmp(lower, "mbcs") == 0)
1515 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001516#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001517 else if (strcmp(lower, "ascii") == 0)
1518 return PyUnicode_DecodeASCII(s, size, errors);
1519 else if (strcmp(lower, "utf-16") == 0)
1520 return PyUnicode_DecodeUTF16(s, size, errors, 0);
1521 else if (strcmp(lower, "utf-32") == 0)
1522 return PyUnicode_DecodeUTF32(s, size, errors, 0);
1523 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001524
1525 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001526 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00001527 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00001528 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00001529 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 if (buffer == NULL)
1531 goto onError;
1532 unicode = PyCodec_Decode(buffer, encoding, errors);
1533 if (unicode == NULL)
1534 goto onError;
1535 if (!PyUnicode_Check(unicode)) {
1536 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001537 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00001538 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539 Py_DECREF(unicode);
1540 goto onError;
1541 }
1542 Py_DECREF(buffer);
1543 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001544
Benjamin Peterson29060642009-01-31 22:14:21 +00001545 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546 Py_XDECREF(buffer);
1547 return NULL;
1548}
1549
Alexander Belopolsky40018472011-02-26 01:02:56 +00001550PyObject *
1551PyUnicode_AsDecodedObject(PyObject *unicode,
1552 const char *encoding,
1553 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001554{
1555 PyObject *v;
1556
1557 if (!PyUnicode_Check(unicode)) {
1558 PyErr_BadArgument();
1559 goto onError;
1560 }
1561
1562 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001563 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001564
1565 /* Decode via the codec registry */
1566 v = PyCodec_Decode(unicode, encoding, errors);
1567 if (v == NULL)
1568 goto onError;
1569 return v;
1570
Benjamin Peterson29060642009-01-31 22:14:21 +00001571 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001572 return NULL;
1573}
1574
Alexander Belopolsky40018472011-02-26 01:02:56 +00001575PyObject *
1576PyUnicode_AsDecodedUnicode(PyObject *unicode,
1577 const char *encoding,
1578 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001579{
1580 PyObject *v;
1581
1582 if (!PyUnicode_Check(unicode)) {
1583 PyErr_BadArgument();
1584 goto onError;
1585 }
1586
1587 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001588 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001589
1590 /* Decode via the codec registry */
1591 v = PyCodec_Decode(unicode, encoding, errors);
1592 if (v == NULL)
1593 goto onError;
1594 if (!PyUnicode_Check(v)) {
1595 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001596 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001597 Py_TYPE(v)->tp_name);
1598 Py_DECREF(v);
1599 goto onError;
1600 }
1601 return v;
1602
Benjamin Peterson29060642009-01-31 22:14:21 +00001603 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001604 return NULL;
1605}
1606
Alexander Belopolsky40018472011-02-26 01:02:56 +00001607PyObject *
1608PyUnicode_Encode(const Py_UNICODE *s,
1609 Py_ssize_t size,
1610 const char *encoding,
1611 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612{
1613 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00001614
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615 unicode = PyUnicode_FromUnicode(s, size);
1616 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1619 Py_DECREF(unicode);
1620 return v;
1621}
1622
Alexander Belopolsky40018472011-02-26 01:02:56 +00001623PyObject *
1624PyUnicode_AsEncodedObject(PyObject *unicode,
1625 const char *encoding,
1626 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001627{
1628 PyObject *v;
1629
1630 if (!PyUnicode_Check(unicode)) {
1631 PyErr_BadArgument();
1632 goto onError;
1633 }
1634
1635 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001636 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001637
1638 /* Encode via the codec registry */
1639 v = PyCodec_Encode(unicode, encoding, errors);
1640 if (v == NULL)
1641 goto onError;
1642 return v;
1643
Benjamin Peterson29060642009-01-31 22:14:21 +00001644 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00001645 return NULL;
1646}
1647
Victor Stinnerad158722010-10-27 00:25:46 +00001648PyObject *
1649PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00001650{
Victor Stinner313a1202010-06-11 23:56:51 +00001651#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinnerad158722010-10-27 00:25:46 +00001652 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1653 PyUnicode_GET_SIZE(unicode),
1654 NULL);
1655#elif defined(__APPLE__)
1656 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1657 PyUnicode_GET_SIZE(unicode),
1658 "surrogateescape");
1659#else
1660 if (Py_FileSystemDefaultEncoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00001661 return PyUnicode_AsEncodedString(unicode,
1662 Py_FileSystemDefaultEncoding,
1663 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00001664 }
1665 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001666 /* locale encoding with surrogateescape */
1667 wchar_t *wchar;
1668 char *bytes;
1669 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00001670 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001671
1672 wchar = PyUnicode_AsWideCharString(unicode, NULL);
1673 if (wchar == NULL)
1674 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001675 bytes = _Py_wchar2char(wchar, &error_pos);
1676 if (bytes == NULL) {
1677 if (error_pos != (size_t)-1) {
1678 char *errmsg = strerror(errno);
1679 PyObject *exc = NULL;
1680 if (errmsg == NULL)
1681 errmsg = "Py_wchar2char() failed";
1682 raise_encode_exception(&exc,
1683 "filesystemencoding",
1684 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1685 error_pos, error_pos+1,
1686 errmsg);
1687 Py_XDECREF(exc);
1688 }
1689 else
1690 PyErr_NoMemory();
1691 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001692 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00001693 }
1694 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001695
1696 bytes_obj = PyBytes_FromString(bytes);
1697 PyMem_Free(bytes);
1698 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00001699 }
Victor Stinnerad158722010-10-27 00:25:46 +00001700#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00001701}
1702
Alexander Belopolsky40018472011-02-26 01:02:56 +00001703PyObject *
1704PyUnicode_AsEncodedString(PyObject *unicode,
1705 const char *encoding,
1706 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707{
1708 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00001709 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00001710
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 if (!PyUnicode_Check(unicode)) {
1712 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001713 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714 }
Fred Drakee4315f52000-05-09 19:53:39 +00001715
Tim Petersced69f82003-09-16 20:30:58 +00001716 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001717 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1718 PyUnicode_GET_SIZE(unicode),
1719 errors);
Fred Drakee4315f52000-05-09 19:53:39 +00001720
1721 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00001722 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001723 if ((strcmp(lower, "utf-8") == 0) ||
1724 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00001725 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1726 PyUnicode_GET_SIZE(unicode),
1727 errors);
1728 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00001729 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00001730 (strcmp(lower, "iso-8859-1") == 0))
1731 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1732 PyUnicode_GET_SIZE(unicode),
1733 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001734#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Victor Stinner37296e82010-06-10 13:36:23 +00001735 else if (strcmp(lower, "mbcs") == 0)
1736 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1737 PyUnicode_GET_SIZE(unicode),
1738 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00001739#endif
Victor Stinner37296e82010-06-10 13:36:23 +00001740 else if (strcmp(lower, "ascii") == 0)
1741 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1742 PyUnicode_GET_SIZE(unicode),
1743 errors);
1744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745
1746 /* Encode via the codec registry */
1747 v = PyCodec_Encode(unicode, encoding, errors);
1748 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001749 return NULL;
1750
1751 /* The normal path */
1752 if (PyBytes_Check(v))
1753 return v;
1754
1755 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001756 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001757 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001758 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001759
1760 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1761 "encoder %s returned bytearray instead of bytes",
1762 encoding);
1763 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001764 Py_DECREF(v);
1765 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001766 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001767
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00001768 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1769 Py_DECREF(v);
1770 return b;
1771 }
1772
1773 PyErr_Format(PyExc_TypeError,
1774 "encoder did not return a bytes object (type=%.400s)",
1775 Py_TYPE(v)->tp_name);
1776 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001777 return NULL;
1778}
1779
Alexander Belopolsky40018472011-02-26 01:02:56 +00001780PyObject *
1781PyUnicode_AsEncodedUnicode(PyObject *unicode,
1782 const char *encoding,
1783 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001784{
1785 PyObject *v;
1786
1787 if (!PyUnicode_Check(unicode)) {
1788 PyErr_BadArgument();
1789 goto onError;
1790 }
1791
1792 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001793 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001794
1795 /* Encode via the codec registry */
1796 v = PyCodec_Encode(unicode, encoding, errors);
1797 if (v == NULL)
1798 goto onError;
1799 if (!PyUnicode_Check(v)) {
1800 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00001801 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00001802 Py_TYPE(v)->tp_name);
1803 Py_DECREF(v);
1804 goto onError;
1805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001806 return v;
Tim Petersced69f82003-09-16 20:30:58 +00001807
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 return NULL;
1810}
1811
Alexander Belopolsky40018472011-02-26 01:02:56 +00001812PyObject *
1813_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1814 const char *errors)
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001815{
1816 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001817 if (v)
1818 return v;
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001819 if (errors != NULL)
1820 Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
Guido van Rossum98297ee2007-11-06 21:34:58 +00001821 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Guido van Rossum06610092007-08-16 21:02:22 +00001822 PyUnicode_GET_SIZE(unicode),
1823 NULL);
Guido van Rossum98297ee2007-11-06 21:34:58 +00001824 if (!v)
Guido van Rossumf15a29f2007-05-04 00:41:39 +00001825 return NULL;
Guido van Rossume7a0d392007-07-12 07:53:00 +00001826 ((PyUnicodeObject *)unicode)->defenc = v;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00001827 return v;
1828}
1829
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001830PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00001831PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001832 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00001833 return PyUnicode_DecodeFSDefaultAndSize(s, size);
1834}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001835
Christian Heimes5894ba72007-11-04 11:43:14 +00001836PyObject*
1837PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1838{
Victor Stinnerad158722010-10-27 00:25:46 +00001839#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1840 return PyUnicode_DecodeMBCS(s, size, NULL);
1841#elif defined(__APPLE__)
1842 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1843#else
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001844 /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1845 can be undefined. If it is case, decode using UTF-8. The following assumes
1846 that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1847 bootstrapping process where the codecs aren't ready yet.
1848 */
1849 if (Py_FileSystemDefaultEncoding) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001850 return PyUnicode_Decode(s, size,
1851 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00001852 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001853 }
1854 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001855 /* locale encoding with surrogateescape */
1856 wchar_t *wchar;
1857 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00001858 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001859
1860 if (s[size] != '\0' || size != strlen(s)) {
1861 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1862 return NULL;
1863 }
1864
Victor Stinner168e1172010-10-16 23:16:16 +00001865 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001866 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00001867 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001868
Victor Stinner168e1172010-10-16 23:16:16 +00001869 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00001870 PyMem_Free(wchar);
1871 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001872 }
Victor Stinnerad158722010-10-27 00:25:46 +00001873#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00001874}
1875
Martin v. Löwis011e8422009-05-05 04:43:17 +00001876
1877int
1878PyUnicode_FSConverter(PyObject* arg, void* addr)
1879{
1880 PyObject *output = NULL;
1881 Py_ssize_t size;
1882 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001883 if (arg == NULL) {
1884 Py_DECREF(*(PyObject**)addr);
1885 return 1;
1886 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00001887 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00001888 output = arg;
1889 Py_INCREF(output);
1890 }
1891 else {
1892 arg = PyUnicode_FromObject(arg);
1893 if (!arg)
1894 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00001895 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001896 Py_DECREF(arg);
1897 if (!output)
1898 return 0;
1899 if (!PyBytes_Check(output)) {
1900 Py_DECREF(output);
1901 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1902 return 0;
1903 }
1904 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00001905 size = PyBytes_GET_SIZE(output);
1906 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001907 if (size != strlen(data)) {
1908 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1909 Py_DECREF(output);
1910 return 0;
1911 }
1912 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00001913 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001914}
1915
1916
Victor Stinner47fcb5b2010-08-13 23:59:58 +00001917int
1918PyUnicode_FSDecoder(PyObject* arg, void* addr)
1919{
1920 PyObject *output = NULL;
1921 Py_ssize_t size;
1922 void *data;
1923 if (arg == NULL) {
1924 Py_DECREF(*(PyObject**)addr);
1925 return 1;
1926 }
1927 if (PyUnicode_Check(arg)) {
1928 output = arg;
1929 Py_INCREF(output);
1930 }
1931 else {
1932 arg = PyBytes_FromObject(arg);
1933 if (!arg)
1934 return 0;
1935 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1936 PyBytes_GET_SIZE(arg));
1937 Py_DECREF(arg);
1938 if (!output)
1939 return 0;
1940 if (!PyUnicode_Check(output)) {
1941 Py_DECREF(output);
1942 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1943 return 0;
1944 }
1945 }
1946 size = PyUnicode_GET_SIZE(output);
1947 data = PyUnicode_AS_UNICODE(output);
1948 if (size != Py_UNICODE_strlen(data)) {
1949 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1950 Py_DECREF(output);
1951 return 0;
1952 }
1953 *(PyObject**)addr = output;
1954 return Py_CLEANUP_SUPPORTED;
1955}
1956
1957
Martin v. Löwis5b222132007-06-10 09:51:05 +00001958char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001959_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001960{
Christian Heimesf3863112007-11-22 07:46:41 +00001961 PyObject *bytes;
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00001962 if (!PyUnicode_Check(unicode)) {
1963 PyErr_BadArgument();
1964 return NULL;
1965 }
Christian Heimesf3863112007-11-22 07:46:41 +00001966 bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1967 if (bytes == NULL)
Martin v. Löwis5b222132007-06-10 09:51:05 +00001968 return NULL;
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001969 if (psize != NULL)
Christian Heimes72b710a2008-05-26 13:28:38 +00001970 *psize = PyBytes_GET_SIZE(bytes);
1971 return PyBytes_AS_STRING(bytes);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001972}
1973
1974char*
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001975_PyUnicode_AsString(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00001976{
Marc-André Lemburg4cc0f242008-08-07 18:54:33 +00001977 return _PyUnicode_AsStringAndSize(unicode, NULL);
Martin v. Löwis5b222132007-06-10 09:51:05 +00001978}
1979
Alexander Belopolsky40018472011-02-26 01:02:56 +00001980Py_UNICODE *
1981PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982{
1983 if (!PyUnicode_Check(unicode)) {
1984 PyErr_BadArgument();
1985 goto onError;
1986 }
1987 return PyUnicode_AS_UNICODE(unicode);
1988
Benjamin Peterson29060642009-01-31 22:14:21 +00001989 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 return NULL;
1991}
1992
Alexander Belopolsky40018472011-02-26 01:02:56 +00001993Py_ssize_t
1994PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995{
1996 if (!PyUnicode_Check(unicode)) {
1997 PyErr_BadArgument();
1998 goto onError;
1999 }
2000 return PyUnicode_GET_SIZE(unicode);
2001
Benjamin Peterson29060642009-01-31 22:14:21 +00002002 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 return -1;
2004}
2005
/* Return the name of the default encoding, which is always "utf-8".
   The string has static storage; callers must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2011
Victor Stinner554f3f02010-06-16 23:33:54 +00002012/* create or adjust a UnicodeDecodeError */
2013static void
2014make_decode_exception(PyObject **exceptionObject,
2015 const char *encoding,
2016 const char *input, Py_ssize_t length,
2017 Py_ssize_t startpos, Py_ssize_t endpos,
2018 const char *reason)
2019{
2020 if (*exceptionObject == NULL) {
2021 *exceptionObject = PyUnicodeDecodeError_Create(
2022 encoding, input, length, startpos, endpos, reason);
2023 }
2024 else {
2025 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2026 goto onError;
2027 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2028 goto onError;
2029 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2030 goto onError;
2031 }
2032 return;
2033
2034onError:
2035 Py_DECREF(*exceptionObject);
2036 *exceptionObject = NULL;
2037}
2038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002039/* error handling callback helper:
2040 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00002041 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 and adjust various state variables.
2043 return 0 on success, -1 on error
2044*/
2045
Alexander Belopolsky40018472011-02-26 01:02:56 +00002046static int
2047unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2048 const char *encoding, const char *reason,
2049 const char **input, const char **inend, Py_ssize_t *startinpos,
2050 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2051 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002052{
Benjamin Peterson142957c2008-07-04 19:55:29 +00002053 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002054
2055 PyObject *restuple = NULL;
2056 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002057 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002058 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002059 Py_ssize_t requiredsize;
2060 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002062 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002063 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002064 int res = -1;
2065
2066 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002067 *errorHandler = PyCodec_LookupError(errors);
2068 if (*errorHandler == NULL)
2069 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002070 }
2071
Victor Stinner554f3f02010-06-16 23:33:54 +00002072 make_decode_exception(exceptionObject,
2073 encoding,
2074 *input, *inend - *input,
2075 *startinpos, *endinpos,
2076 reason);
2077 if (*exceptionObject == NULL)
2078 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002079
2080 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2081 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002082 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002083 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00002084 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00002085 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 }
2087 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00002088 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002089
2090 /* Copy back the bytes variables, which might have been modified by the
2091 callback */
2092 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2093 if (!inputobj)
2094 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00002095 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002096 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00002097 }
Christian Heimes72b710a2008-05-26 13:28:38 +00002098 *input = PyBytes_AS_STRING(inputobj);
2099 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002100 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00002101 /* we can DECREF safely, as the exception has another reference,
2102 so the object won't go away. */
2103 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00002104
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002105 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002106 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002107 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002108 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2109 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002110 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002111
2112 /* need more space? (at least enough for what we
2113 have+the replacement+the rest of the string (starting
2114 at the new input position), so we won't have to check space
2115 when there are no errors in the rest of the string) */
2116 repptr = PyUnicode_AS_UNICODE(repunicode);
2117 repsize = PyUnicode_GET_SIZE(repunicode);
2118 requiredsize = *outpos + repsize + insize-newpos;
2119 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002120 if (requiredsize<2*outsize)
2121 requiredsize = 2*outsize;
2122 if (_PyUnicode_Resize(output, requiredsize) < 0)
2123 goto onError;
2124 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 }
2126 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002127 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002128 Py_UNICODE_COPY(*outptr, repptr, repsize);
2129 *outptr += repsize;
2130 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00002131
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002132 /* we made it! */
2133 res = 0;
2134
Benjamin Peterson29060642009-01-31 22:14:21 +00002135 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002136 Py_XDECREF(restuple);
2137 return res;
2138}
2139
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002140/* --- UTF-7 Codec -------------------------------------------------------- */
2141
Antoine Pitrou244651a2009-05-04 18:56:13 +00002142/* See RFC2152 for details. We encode conservatively and decode liberally. */
2143
/* Three simple macros defining base-64, plus the direct-decoding test.
   All of them evaluate their argument more than once (TO_BASE64
   excepted), so pass side-effect-free expressions only. */

/* Is c a base-64 character?  (A-Z, a-z, 0-9, '+' or '/') */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62 and anything
   else (i.e. '/') to 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)                                                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"     \
     [(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                    \
    ((c) < 128 && (c) != '+')
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever
 * read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index,
 * so NUL and non-ASCII values are never encoded directly.  c is
 * evaluated several times; pass a side-effect-free expression. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002220
Alexander Belopolsky40018472011-02-26 01:02:56 +00002221PyObject *
2222PyUnicode_DecodeUTF7(const char *s,
2223 Py_ssize_t size,
2224 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002225{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002226 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2227}
2228
Antoine Pitrou244651a2009-05-04 18:56:13 +00002229/* The decoder. The only state we preserve is our read position,
2230 * i.e. how many characters we have consumed. So if we end in the
2231 * middle of a shift sequence we have to back off the read position
2232 * and the output to the beginning of the sequence, otherwise we lose
2233 * all the shift state (seen bits, number of bits seen, high
2234 * surrogate). */
2235
Alexander Belopolsky40018472011-02-26 01:02:56 +00002236PyObject *
2237PyUnicode_DecodeUTF7Stateful(const char *s,
2238 Py_ssize_t size,
2239 const char *errors,
2240 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002241{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002242 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002243 Py_ssize_t startinpos;
2244 Py_ssize_t endinpos;
2245 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002246 const char *e;
2247 PyUnicodeObject *unicode;
2248 Py_UNICODE *p;
2249 const char *errmsg = "";
2250 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002251 Py_UNICODE *shiftOutStart;
2252 unsigned int base64bits = 0;
2253 unsigned long base64buffer = 0;
2254 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002255 PyObject *errorHandler = NULL;
2256 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002257
2258 unicode = _PyUnicode_New(size);
2259 if (!unicode)
2260 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002261 if (size == 0) {
2262 if (consumed)
2263 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002264 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002265 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002266
2267 p = unicode->str;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002268 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002269 e = s + size;
2270
2271 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002272 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00002273 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00002274 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002275
Antoine Pitrou244651a2009-05-04 18:56:13 +00002276 if (inShift) { /* in a base-64 section */
2277 if (IS_BASE64(ch)) { /* consume a base-64 character */
2278 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2279 base64bits += 6;
2280 s++;
2281 if (base64bits >= 16) {
2282 /* we have enough bits for a UTF-16 value */
2283 Py_UNICODE outCh = (Py_UNICODE)
2284 (base64buffer >> (base64bits-16));
2285 base64bits -= 16;
2286 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2287 if (surrogate) {
2288 /* expecting a second surrogate */
2289 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2290#ifdef Py_UNICODE_WIDE
2291 *p++ = (((surrogate & 0x3FF)<<10)
2292 | (outCh & 0x3FF)) + 0x10000;
2293#else
2294 *p++ = surrogate;
2295 *p++ = outCh;
2296#endif
2297 surrogate = 0;
2298 }
2299 else {
2300 surrogate = 0;
2301 errmsg = "second surrogate missing";
2302 goto utf7Error;
2303 }
2304 }
2305 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2306 /* first surrogate */
2307 surrogate = outCh;
2308 }
2309 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2310 errmsg = "unexpected second surrogate";
2311 goto utf7Error;
2312 }
2313 else {
2314 *p++ = outCh;
2315 }
2316 }
2317 }
2318 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002319 inShift = 0;
2320 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002321 if (surrogate) {
2322 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00002323 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002324 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002325 if (base64bits > 0) { /* left-over bits */
2326 if (base64bits >= 6) {
2327 /* We've seen at least one base-64 character */
2328 errmsg = "partial character in shift sequence";
2329 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002330 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002331 else {
2332 /* Some bits remain; they should be zero */
2333 if (base64buffer != 0) {
2334 errmsg = "non-zero padding bits in shift sequence";
2335 goto utf7Error;
2336 }
2337 }
2338 }
2339 if (ch != '-') {
2340 /* '-' is absorbed; other terminating
2341 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002342 *p++ = ch;
2343 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002344 }
2345 }
2346 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002347 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002348 s++; /* consume '+' */
2349 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002350 s++;
2351 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00002352 }
2353 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002354 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002355 shiftOutStart = p;
2356 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002357 }
2358 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002359 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002360 *p++ = ch;
2361 s++;
2362 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002363 else {
2364 startinpos = s-starts;
2365 s++;
2366 errmsg = "unexpected special character";
2367 goto utf7Error;
2368 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002369 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002370utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002371 outpos = p-PyUnicode_AS_UNICODE(unicode);
2372 endinpos = s-starts;
2373 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00002374 errors, &errorHandler,
2375 "utf7", errmsg,
2376 &starts, &e, &startinpos, &endinpos, &exc, &s,
2377 &unicode, &outpos, &p))
2378 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002379 }
2380
Antoine Pitrou244651a2009-05-04 18:56:13 +00002381 /* end of string */
2382
2383 if (inShift && !consumed) { /* in shift sequence, no more to follow */
2384 /* if we're in an inconsistent state, that's an error */
2385 if (surrogate ||
2386 (base64bits >= 6) ||
2387 (base64bits > 0 && base64buffer != 0)) {
2388 outpos = p-PyUnicode_AS_UNICODE(unicode);
2389 endinpos = size;
2390 if (unicode_decode_call_errorhandler(
2391 errors, &errorHandler,
2392 "utf7", "unterminated shift sequence",
2393 &starts, &e, &startinpos, &endinpos, &exc, &s,
2394 &unicode, &outpos, &p))
2395 goto onError;
2396 if (s < e)
2397 goto restart;
2398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002400
2401 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002402 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00002403 if (inShift) {
2404 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002405 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002406 }
2407 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002408 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002409 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00002410 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002411
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002412 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002413 goto onError;
2414
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002417 return (PyObject *)unicode;
2418
Benjamin Peterson29060642009-01-31 22:14:21 +00002419 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002420 Py_XDECREF(errorHandler);
2421 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002422 Py_DECREF(unicode);
2423 return NULL;
2424}
2425
2426
Alexander Belopolsky40018472011-02-26 01:02:56 +00002427PyObject *
2428PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2429 Py_ssize_t size,
2430 int base64SetO,
2431 int base64WhiteSpace,
2432 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002433{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002434 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002435 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002436 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002437 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002438 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002439 unsigned int base64bits = 0;
2440 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002441 char * out;
2442 char * start;
2443
2444 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00002445 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002446
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00002447 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00002448 return PyErr_NoMemory();
2449
Antoine Pitrou244651a2009-05-04 18:56:13 +00002450 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002451 if (v == NULL)
2452 return NULL;
2453
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002454 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002455 for (;i < size; ++i) {
2456 Py_UNICODE ch = s[i];
2457
Antoine Pitrou244651a2009-05-04 18:56:13 +00002458 if (inShift) {
2459 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2460 /* shifting out */
2461 if (base64bits) { /* output remaining bits */
2462 *out++ = TO_BASE64(base64buffer << (6-base64bits));
2463 base64buffer = 0;
2464 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002465 }
2466 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00002467 /* Characters not in the BASE64 set implicitly unshift the sequence
2468 so no '-' is required, except if the character is itself a '-' */
2469 if (IS_BASE64(ch) || ch == '-') {
2470 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002471 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002472 *out++ = (char) ch;
2473 }
2474 else {
2475 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00002476 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002477 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002478 else { /* not in a shift sequence */
2479 if (ch == '+') {
2480 *out++ = '+';
2481 *out++ = '-';
2482 }
2483 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2484 *out++ = (char) ch;
2485 }
2486 else {
2487 *out++ = '+';
2488 inShift = 1;
2489 goto encode_char;
2490 }
2491 }
2492 continue;
2493encode_char:
2494#ifdef Py_UNICODE_WIDE
2495 if (ch >= 0x10000) {
2496 /* code first surrogate */
2497 base64bits += 16;
2498 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2499 while (base64bits >= 6) {
2500 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2501 base64bits -= 6;
2502 }
2503 /* prepare second surrogate */
2504 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
2505 }
2506#endif
2507 base64bits += 16;
2508 base64buffer = (base64buffer << 16) | ch;
2509 while (base64bits >= 6) {
2510 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2511 base64bits -= 6;
2512 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00002513 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00002514 if (base64bits)
2515 *out++= TO_BASE64(base64buffer << (6-base64bits) );
2516 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002517 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00002518 if (_PyBytes_Resize(&v, out - start) < 0)
2519 return NULL;
2520 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002521}
2522
Antoine Pitrou244651a2009-05-04 18:56:13 +00002523#undef IS_BASE64
2524#undef FROM_BASE64
2525#undef TO_BASE64
2526#undef DECODE_DIRECT
2527#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00002528
Guido van Rossumd57fd912000-03-10 22:53:23 +00002529/* --- UTF-8 Codec -------------------------------------------------------- */
2530
/* Map a UTF-8 lead (first) byte to the total length of the sequence it
   introduces.  Zero marks a byte that cannot start a sequence:
   continuation bytes (80-BF), the overlong lead bytes C0/C1, and F5-FF,
   which would encode values above U+10FFFF.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F: ASCII */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F: continuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 invalid, C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4, F5-FF invalid */
};
2552
Alexander Belopolsky40018472011-02-26 01:02:56 +00002553PyObject *
2554PyUnicode_DecodeUTF8(const char *s,
2555 Py_ssize_t size,
2556 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557{
Walter Dörwald69652032004-09-07 20:24:22 +00002558 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2559}
2560
Antoine Pitrouab868312009-01-10 15:40:25 +00002561/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2562#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2563
2564/* Mask to quickly check whether a C 'long' contains a
2565 non-ASCII, UTF8-encoded char. */
2566#if (SIZEOF_LONG == 8)
2567# define ASCII_CHAR_MASK 0x8080808080808080L
2568#elif (SIZEOF_LONG == 4)
2569# define ASCII_CHAR_MASK 0x80808080L
2570#else
2571# error C 'long' size should be either 4 or 8!
2572#endif
2573
Alexander Belopolsky40018472011-02-26 01:02:56 +00002574PyObject *
2575PyUnicode_DecodeUTF8Stateful(const char *s,
2576 Py_ssize_t size,
2577 const char *errors,
2578 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00002579{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002580 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002581 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00002582 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00002583 Py_ssize_t startinpos;
2584 Py_ssize_t endinpos;
2585 Py_ssize_t outpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00002586 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002587 PyUnicodeObject *unicode;
2588 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002589 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002590 PyObject *errorHandler = NULL;
2591 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002592
2593 /* Note: size will always be longer than the resulting Unicode
2594 character count */
2595 unicode = _PyUnicode_New(size);
2596 if (!unicode)
2597 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00002598 if (size == 0) {
2599 if (consumed)
2600 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00002602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603
2604 /* Unpack UTF-8 encoded data */
2605 p = unicode->str;
2606 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00002607 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608
2609 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002610 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002611
2612 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00002613 /* Fast path for runs of ASCII characters. Given that common UTF-8
2614 input will consist of an overwhelming majority of ASCII
2615 characters, we try to optimize for this case by checking
2616 as many characters as a C 'long' can contain.
2617 First, check if we can do an aligned read, as most CPUs have
2618 a penalty for unaligned reads.
2619 */
2620 if (!((size_t) s & LONG_PTR_MASK)) {
2621 /* Help register allocation */
2622 register const char *_s = s;
2623 register Py_UNICODE *_p = p;
2624 while (_s < aligned_end) {
2625 /* Read a whole long at a time (either 4 or 8 bytes),
2626 and do a fast unrolled copy if it only contains ASCII
2627 characters. */
2628 unsigned long data = *(unsigned long *) _s;
2629 if (data & ASCII_CHAR_MASK)
2630 break;
2631 _p[0] = (unsigned char) _s[0];
2632 _p[1] = (unsigned char) _s[1];
2633 _p[2] = (unsigned char) _s[2];
2634 _p[3] = (unsigned char) _s[3];
2635#if (SIZEOF_LONG == 8)
2636 _p[4] = (unsigned char) _s[4];
2637 _p[5] = (unsigned char) _s[5];
2638 _p[6] = (unsigned char) _s[6];
2639 _p[7] = (unsigned char) _s[7];
2640#endif
2641 _s += SIZEOF_LONG;
2642 _p += SIZEOF_LONG;
2643 }
2644 s = _s;
2645 p = _p;
2646 if (s == e)
2647 break;
2648 ch = (unsigned char)*s;
2649 }
2650 }
2651
2652 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002653 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 s++;
2655 continue;
2656 }
2657
2658 n = utf8_code_length[ch];
2659
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002660 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002661 if (consumed)
2662 break;
2663 else {
2664 errmsg = "unexpected end of data";
2665 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002666 endinpos = startinpos+1;
2667 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2668 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002669 goto utf8Error;
2670 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672
2673 switch (n) {
2674
2675 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00002676 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002677 startinpos = s-starts;
2678 endinpos = startinpos+1;
2679 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680
2681 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002682 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 startinpos = s-starts;
2684 endinpos = startinpos+1;
2685 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002686
2687 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00002688 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00002689 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002690 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002691 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 goto utf8Error;
2693 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002695 assert ((ch > 0x007F) && (ch <= 0x07FF));
2696 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 break;
2698
2699 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00002700 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2701 will result in surrogates in range d800-dfff. Surrogates are
2702 not valid UTF-8 so they are rejected.
2703 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2704 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00002705 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002706 (s[2] & 0xc0) != 0x80 ||
2707 ((unsigned char)s[0] == 0xE0 &&
2708 (unsigned char)s[1] < 0xA0) ||
2709 ((unsigned char)s[0] == 0xED &&
2710 (unsigned char)s[1] > 0x9F)) {
2711 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002712 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002713 endinpos = startinpos + 1;
2714
2715 /* if s[1] first two bits are 1 and 0, then the invalid
2716 continuation byte is s[2], so increment endinpos by 1,
2717 if not, s[1] is invalid and endinpos doesn't need to
2718 be incremented. */
2719 if ((s[1] & 0xC0) == 0x80)
2720 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00002721 goto utf8Error;
2722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00002724 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2725 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002726 break;
2727
2728 case 4:
2729 if ((s[1] & 0xc0) != 0x80 ||
2730 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00002731 (s[3] & 0xc0) != 0x80 ||
2732 ((unsigned char)s[0] == 0xF0 &&
2733 (unsigned char)s[1] < 0x90) ||
2734 ((unsigned char)s[0] == 0xF4 &&
2735 (unsigned char)s[1] > 0x8F)) {
2736 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00002738 endinpos = startinpos + 1;
2739 if ((s[1] & 0xC0) == 0x80) {
2740 endinpos++;
2741 if ((s[2] & 0xC0) == 0x80)
2742 endinpos++;
2743 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002744 goto utf8Error;
2745 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002746 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00002747 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2748 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2749
Fredrik Lundh8f455852001-06-27 18:59:43 +00002750#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00002751 *p++ = (Py_UNICODE)ch;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002752#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002753 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00002754
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002755 /* translate from 10000..10FFFF to 0..FFFF */
2756 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00002757
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002758 /* high surrogate = top 10 bits added to D800 */
2759 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00002760
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002761 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00002762 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002763#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765 }
2766 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00002767 continue;
Tim Petersced69f82003-09-16 20:30:58 +00002768
Benjamin Peterson29060642009-01-31 22:14:21 +00002769 utf8Error:
2770 outpos = p-PyUnicode_AS_UNICODE(unicode);
2771 if (unicode_decode_call_errorhandler(
2772 errors, &errorHandler,
2773 "utf8", errmsg,
2774 &starts, &e, &startinpos, &endinpos, &exc, &s,
2775 &unicode, &outpos, &p))
2776 goto onError;
2777 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 }
Walter Dörwald69652032004-09-07 20:24:22 +00002779 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00002780 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781
2782 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002783 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 goto onError;
2785
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002786 Py_XDECREF(errorHandler);
2787 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788 return (PyObject *)unicode;
2789
Benjamin Peterson29060642009-01-31 22:14:21 +00002790 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002791 Py_XDECREF(errorHandler);
2792 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 Py_DECREF(unicode);
2794 return NULL;
2795}
2796
Antoine Pitrouab868312009-01-10 15:40:25 +00002797#undef ASCII_CHAR_MASK
2798
Victor Stinnerf933e1a2010-10-20 22:58:25 +00002799#ifdef __APPLE__
2800
2801/* Simplified UTF-8 decoder using surrogateescape error handler,
2802 used to decode the command line arguments on Mac OS X. */
2803
2804wchar_t*
2805_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2806{
2807 int n;
2808 const char *e;
2809 wchar_t *unicode, *p;
2810
2811 /* Note: size will always be longer than the resulting Unicode
2812 character count */
2813 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2814 PyErr_NoMemory();
2815 return NULL;
2816 }
2817 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2818 if (!unicode)
2819 return NULL;
2820
2821 /* Unpack UTF-8 encoded data */
2822 p = unicode;
2823 e = s + size;
2824 while (s < e) {
2825 Py_UCS4 ch = (unsigned char)*s;
2826
2827 if (ch < 0x80) {
2828 *p++ = (wchar_t)ch;
2829 s++;
2830 continue;
2831 }
2832
2833 n = utf8_code_length[ch];
2834 if (s + n > e) {
2835 goto surrogateescape;
2836 }
2837
2838 switch (n) {
2839 case 0:
2840 case 1:
2841 goto surrogateescape;
2842
2843 case 2:
2844 if ((s[1] & 0xc0) != 0x80)
2845 goto surrogateescape;
2846 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2847 assert ((ch > 0x007F) && (ch <= 0x07FF));
2848 *p++ = (wchar_t)ch;
2849 break;
2850
2851 case 3:
2852 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2853 will result in surrogates in range d800-dfff. Surrogates are
2854 not valid UTF-8 so they are rejected.
2855 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2856 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2857 if ((s[1] & 0xc0) != 0x80 ||
2858 (s[2] & 0xc0) != 0x80 ||
2859 ((unsigned char)s[0] == 0xE0 &&
2860 (unsigned char)s[1] < 0xA0) ||
2861 ((unsigned char)s[0] == 0xED &&
2862 (unsigned char)s[1] > 0x9F)) {
2863
2864 goto surrogateescape;
2865 }
2866 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2867 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2868 *p++ = (Py_UNICODE)ch;
2869 break;
2870
2871 case 4:
2872 if ((s[1] & 0xc0) != 0x80 ||
2873 (s[2] & 0xc0) != 0x80 ||
2874 (s[3] & 0xc0) != 0x80 ||
2875 ((unsigned char)s[0] == 0xF0 &&
2876 (unsigned char)s[1] < 0x90) ||
2877 ((unsigned char)s[0] == 0xF4 &&
2878 (unsigned char)s[1] > 0x8F)) {
2879 goto surrogateescape;
2880 }
2881 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2882 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2883 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2884
2885#if SIZEOF_WCHAR_T == 4
2886 *p++ = (wchar_t)ch;
2887#else
2888 /* compute and append the two surrogates: */
2889
2890 /* translate from 10000..10FFFF to 0..FFFF */
2891 ch -= 0x10000;
2892
2893 /* high surrogate = top 10 bits added to D800 */
2894 *p++ = (wchar_t)(0xD800 + (ch >> 10));
2895
2896 /* low surrogate = bottom 10 bits added to DC00 */
2897 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2898#endif
2899 break;
2900 }
2901 s += n;
2902 continue;
2903
2904 surrogateescape:
2905 *p++ = 0xDC00 + ch;
2906 s++;
2907 }
2908 *p = L'\0';
2909 return unicode;
2910}
2911
2912#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00002913
Tim Peters602f7402002-04-27 18:03:26 +00002914/* Allocation strategy: if the string is short, convert into a stack buffer
2915 and allocate exactly as much space needed at the end. Else allocate the
2916 maximum possible needed (4 result bytes per Unicode character), and return
2917 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002918*/
Tim Peters7e3d9612002-04-21 03:26:37 +00002919PyObject *
2920PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00002921 Py_ssize_t size,
2922 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923{
Tim Peters602f7402002-04-27 18:03:26 +00002924#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00002925
Guido van Rossum98297ee2007-11-06 21:34:58 +00002926 Py_ssize_t i; /* index into s of next input byte */
2927 PyObject *result; /* result string object */
2928 char *p; /* next free byte in output buffer */
2929 Py_ssize_t nallocated; /* number of result bytes allocated */
2930 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00002931 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002932 PyObject *errorHandler = NULL;
2933 PyObject *exc = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00002934
Tim Peters602f7402002-04-27 18:03:26 +00002935 assert(s != NULL);
2936 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937
Tim Peters602f7402002-04-27 18:03:26 +00002938 if (size <= MAX_SHORT_UNICHARS) {
2939 /* Write into the stack buffer; nallocated can't overflow.
2940 * At the end, we'll allocate exactly as much heap space as it
2941 * turns out we need.
2942 */
2943 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002944 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00002945 p = stackbuf;
2946 }
2947 else {
2948 /* Overallocate on the heap, and give the excess back at the end. */
2949 nallocated = size * 4;
2950 if (nallocated / 4 != size) /* overflow! */
2951 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00002952 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002953 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00002954 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00002955 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00002956 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002957
Tim Peters602f7402002-04-27 18:03:26 +00002958 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00002959 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002960
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00002961 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00002962 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00002964
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00002966 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00002967 *p++ = (char)(0xc0 | (ch >> 6));
2968 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002969 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00002970#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00002971 /* Special case: check for high and low surrogate */
2972 if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2973 Py_UCS4 ch2 = s[i];
2974 /* Combine the two surrogates to form a UCS4 value */
2975 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2976 i++;
2977
2978 /* Encode UCS4 Unicode ordinals */
2979 *p++ = (char)(0xf0 | (ch >> 18));
2980 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Tim Peters602f7402002-04-27 18:03:26 +00002981 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2982 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00002983 } else {
Victor Stinner445a6232010-04-22 20:01:57 +00002984#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00002985 Py_ssize_t newpos;
2986 PyObject *rep;
2987 Py_ssize_t repsize, k;
2988 rep = unicode_encode_call_errorhandler
2989 (errors, &errorHandler, "utf-8", "surrogates not allowed",
2990 s, size, &exc, i-1, i, &newpos);
2991 if (!rep)
2992 goto error;
2993
2994 if (PyBytes_Check(rep))
2995 repsize = PyBytes_GET_SIZE(rep);
2996 else
2997 repsize = PyUnicode_GET_SIZE(rep);
2998
2999 if (repsize > 4) {
3000 Py_ssize_t offset;
3001
3002 if (result == NULL)
3003 offset = p - stackbuf;
3004 else
3005 offset = p - PyBytes_AS_STRING(result);
3006
3007 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3008 /* integer overflow */
3009 PyErr_NoMemory();
3010 goto error;
3011 }
3012 nallocated += repsize - 4;
3013 if (result != NULL) {
3014 if (_PyBytes_Resize(&result, nallocated) < 0)
3015 goto error;
3016 } else {
3017 result = PyBytes_FromStringAndSize(NULL, nallocated);
3018 if (result == NULL)
3019 goto error;
3020 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3021 }
3022 p = PyBytes_AS_STRING(result) + offset;
3023 }
3024
3025 if (PyBytes_Check(rep)) {
3026 char *prep = PyBytes_AS_STRING(rep);
3027 for(k = repsize; k > 0; k--)
3028 *p++ = *prep++;
3029 } else /* rep is unicode */ {
3030 Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3031 Py_UNICODE c;
3032
3033 for(k=0; k<repsize; k++) {
3034 c = prep[k];
3035 if (0x80 <= c) {
3036 raise_encode_exception(&exc, "utf-8", s, size,
3037 i-1, i, "surrogates not allowed");
3038 goto error;
3039 }
3040 *p++ = (char)prep[k];
3041 }
3042 }
3043 Py_DECREF(rep);
Victor Stinner445a6232010-04-22 20:01:57 +00003044#ifndef Py_UNICODE_WIDE
Victor Stinner31be90b2010-04-22 19:38:16 +00003045 }
Victor Stinner445a6232010-04-22 20:01:57 +00003046#endif
Victor Stinner31be90b2010-04-22 19:38:16 +00003047 } else if (ch < 0x10000) {
3048 *p++ = (char)(0xe0 | (ch >> 12));
3049 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3050 *p++ = (char)(0x80 | (ch & 0x3f));
3051 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00003052 /* Encode UCS4 Unicode ordinals */
3053 *p++ = (char)(0xf0 | (ch >> 18));
3054 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3055 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3056 *p++ = (char)(0x80 | (ch & 0x3f));
3057 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058 }
Tim Peters0eca65c2002-04-21 17:28:06 +00003059
Guido van Rossum98297ee2007-11-06 21:34:58 +00003060 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00003061 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003062 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00003063 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003064 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003065 }
3066 else {
Christian Heimesf3863112007-11-22 07:46:41 +00003067 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00003068 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00003069 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00003070 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00003071 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003072 Py_XDECREF(errorHandler);
3073 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003074 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00003075 error:
3076 Py_XDECREF(errorHandler);
3077 Py_XDECREF(exc);
3078 Py_XDECREF(result);
3079 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00003080
Tim Peters602f7402002-04-27 18:03:26 +00003081#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082}
3083
Alexander Belopolsky40018472011-02-26 01:02:56 +00003084PyObject *
3085PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 if (!PyUnicode_Check(unicode)) {
3088 PyErr_BadArgument();
3089 return NULL;
3090 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00003091 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003092 PyUnicode_GET_SIZE(unicode),
3093 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003094}
3095
Walter Dörwald41980ca2007-08-16 21:55:45 +00003096/* --- UTF-32 Codec ------------------------------------------------------- */
3097
3098PyObject *
3099PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003100 Py_ssize_t size,
3101 const char *errors,
3102 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003103{
3104 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3105}
3106
3107PyObject *
3108PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003109 Py_ssize_t size,
3110 const char *errors,
3111 int *byteorder,
3112 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003113{
3114 const char *starts = s;
3115 Py_ssize_t startinpos;
3116 Py_ssize_t endinpos;
3117 Py_ssize_t outpos;
3118 PyUnicodeObject *unicode;
3119 Py_UNICODE *p;
3120#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003121 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00003122 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003123#else
3124 const int pairs = 0;
3125#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00003126 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003127 int bo = 0; /* assume native ordering by default */
3128 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00003129 /* Offsets from q for retrieving bytes in the right order. */
3130#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3131 int iorder[] = {0, 1, 2, 3};
3132#else
3133 int iorder[] = {3, 2, 1, 0};
3134#endif
3135 PyObject *errorHandler = NULL;
3136 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00003137
Walter Dörwald41980ca2007-08-16 21:55:45 +00003138 q = (unsigned char *)s;
3139 e = q + size;
3140
3141 if (byteorder)
3142 bo = *byteorder;
3143
3144 /* Check for BOM marks (U+FEFF) in the input and adjust current
3145 byte order setting accordingly. In native mode, the leading BOM
3146 mark is skipped, in all other modes, it is copied to the output
3147 stream as-is (giving a ZWNBSP character). */
3148 if (bo == 0) {
3149 if (size >= 4) {
3150 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00003151 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003152#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003153 if (bom == 0x0000FEFF) {
3154 q += 4;
3155 bo = -1;
3156 }
3157 else if (bom == 0xFFFE0000) {
3158 q += 4;
3159 bo = 1;
3160 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003161#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003162 if (bom == 0x0000FEFF) {
3163 q += 4;
3164 bo = 1;
3165 }
3166 else if (bom == 0xFFFE0000) {
3167 q += 4;
3168 bo = -1;
3169 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003170#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003172 }
3173
3174 if (bo == -1) {
3175 /* force LE */
3176 iorder[0] = 0;
3177 iorder[1] = 1;
3178 iorder[2] = 2;
3179 iorder[3] = 3;
3180 }
3181 else if (bo == 1) {
3182 /* force BE */
3183 iorder[0] = 3;
3184 iorder[1] = 2;
3185 iorder[2] = 1;
3186 iorder[3] = 0;
3187 }
3188
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00003189 /* On narrow builds we split characters outside the BMP into two
3190 codepoints => count how much extra space we need. */
3191#ifndef Py_UNICODE_WIDE
3192 for (qq = q; qq < e; qq += 4)
3193 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3194 pairs++;
3195#endif
3196
3197 /* This might be one to much, because of a BOM */
3198 unicode = _PyUnicode_New((size+3)/4+pairs);
3199 if (!unicode)
3200 return NULL;
3201 if (size == 0)
3202 return (PyObject *)unicode;
3203
3204 /* Unpack UTF-32 encoded data */
3205 p = unicode->str;
3206
Walter Dörwald41980ca2007-08-16 21:55:45 +00003207 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003208 Py_UCS4 ch;
3209 /* remaining bytes at the end? (size should be divisible by 4) */
3210 if (e-q<4) {
3211 if (consumed)
3212 break;
3213 errmsg = "truncated data";
3214 startinpos = ((const char *)q)-starts;
3215 endinpos = ((const char *)e)-starts;
3216 goto utf32Error;
3217 /* The remaining input chars are ignored if the callback
3218 chooses to skip the input */
3219 }
3220 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3221 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00003222
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 if (ch >= 0x110000)
3224 {
3225 errmsg = "codepoint not in range(0x110000)";
3226 startinpos = ((const char *)q)-starts;
3227 endinpos = startinpos+4;
3228 goto utf32Error;
3229 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003230#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 if (ch >= 0x10000)
3232 {
3233 *p++ = 0xD800 | ((ch-0x10000) >> 10);
3234 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3235 }
3236 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00003237#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 *p++ = ch;
3239 q += 4;
3240 continue;
3241 utf32Error:
3242 outpos = p-PyUnicode_AS_UNICODE(unicode);
3243 if (unicode_decode_call_errorhandler(
3244 errors, &errorHandler,
3245 "utf32", errmsg,
3246 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3247 &unicode, &outpos, &p))
3248 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003249 }
3250
3251 if (byteorder)
3252 *byteorder = bo;
3253
3254 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003255 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003256
3257 /* Adjust length */
3258 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3259 goto onError;
3260
3261 Py_XDECREF(errorHandler);
3262 Py_XDECREF(exc);
3263 return (PyObject *)unicode;
3264
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00003266 Py_DECREF(unicode);
3267 Py_XDECREF(errorHandler);
3268 Py_XDECREF(exc);
3269 return NULL;
3270}
3271
3272PyObject *
3273PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003274 Py_ssize_t size,
3275 const char *errors,
3276 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003277{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003278 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003279 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003280 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003281#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003282 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003283#else
3284 const int pairs = 0;
3285#endif
3286 /* Offsets from p for storing byte pairs in the right order. */
3287#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3288 int iorder[] = {0, 1, 2, 3};
3289#else
3290 int iorder[] = {3, 2, 1, 0};
3291#endif
3292
Benjamin Peterson29060642009-01-31 22:14:21 +00003293#define STORECHAR(CH) \
3294 do { \
3295 p[iorder[3]] = ((CH) >> 24) & 0xff; \
3296 p[iorder[2]] = ((CH) >> 16) & 0xff; \
3297 p[iorder[1]] = ((CH) >> 8) & 0xff; \
3298 p[iorder[0]] = (CH) & 0xff; \
3299 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00003300 } while(0)
3301
3302 /* In narrow builds we can output surrogate pairs as one codepoint,
3303 so we need less space. */
3304#ifndef Py_UNICODE_WIDE
3305 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3307 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3308 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003309#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003310 nsize = (size - pairs + (byteorder == 0));
3311 bytesize = nsize * 4;
3312 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003313 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003314 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003315 if (v == NULL)
3316 return NULL;
3317
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003318 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003319 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003320 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003321 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003322 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003323
3324 if (byteorder == -1) {
3325 /* force LE */
3326 iorder[0] = 0;
3327 iorder[1] = 1;
3328 iorder[2] = 2;
3329 iorder[3] = 3;
3330 }
3331 else if (byteorder == 1) {
3332 /* force BE */
3333 iorder[0] = 3;
3334 iorder[1] = 2;
3335 iorder[2] = 1;
3336 iorder[3] = 0;
3337 }
3338
3339 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003340 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003341#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003342 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3343 Py_UCS4 ch2 = *s;
3344 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3345 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3346 s++;
3347 size--;
3348 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003349 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00003350#endif
3351 STORECHAR(ch);
3352 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003353
3354 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003355 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00003356#undef STORECHAR
3357}
3358
Alexander Belopolsky40018472011-02-26 01:02:56 +00003359PyObject *
3360PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00003361{
3362 if (!PyUnicode_Check(unicode)) {
3363 PyErr_BadArgument();
3364 return NULL;
3365 }
3366 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003367 PyUnicode_GET_SIZE(unicode),
3368 NULL,
3369 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00003370}
3371
Guido van Rossumd57fd912000-03-10 22:53:23 +00003372/* --- UTF-16 Codec ------------------------------------------------------- */
3373
Tim Peters772747b2001-08-09 22:21:55 +00003374PyObject *
3375PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003376 Py_ssize_t size,
3377 const char *errors,
3378 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379{
Walter Dörwald69652032004-09-07 20:24:22 +00003380 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3381}
3382
Antoine Pitrouab868312009-01-10 15:40:25 +00003383/* Two masks for fast checking of whether a C 'long' may contain
3384 UTF16-encoded surrogate characters. This is an efficient heuristic,
3385 assuming that non-surrogate characters with a code point >= 0x8000 are
3386 rare in most input.
3387 FAST_CHAR_MASK is used when the input is in native byte ordering,
3388 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
   Each mask has exactly one bit set per 16-bit unit of the long: bit 15 of
   every unit for FAST_CHAR_MASK, bit 7 of every unit for
   SWAPPED_FAST_CHAR_MASK (the top bit of the byte that is the high byte
   once the unit is swapped).  A zero result from "data & mask" therefore
   means every code unit in the long is below 0x8000 and cannot be a
   surrogate (0xD800..0xDFFF); a non-zero result only means "maybe".
   The build fails if SIZEOF_LONG is neither 4 nor 8.
Benjamin Peterson29060642009-01-31 22:14:21 +00003389*/
Antoine Pitrouab868312009-01-10 15:40:25 +00003390#if (SIZEOF_LONG == 8)
3391# define FAST_CHAR_MASK 0x8000800080008000L
3392# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3393#elif (SIZEOF_LONG == 4)
3394# define FAST_CHAR_MASK 0x80008000L
3395# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3396#else
3397# error C 'long' size should be either 4 or 8!
3398#endif
3399
Walter Dörwald69652032004-09-07 20:24:22 +00003400PyObject *
3401PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003402 Py_ssize_t size,
3403 const char *errors,
3404 int *byteorder,
3405 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00003406{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003407 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003408 Py_ssize_t startinpos;
3409 Py_ssize_t endinpos;
3410 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411 PyUnicodeObject *unicode;
3412 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00003413 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00003414 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00003415 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00003416 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00003417 /* Offsets from q for retrieving byte pairs in the right order. */
3418#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3419 int ihi = 1, ilo = 0;
3420#else
3421 int ihi = 0, ilo = 1;
3422#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003423 PyObject *errorHandler = NULL;
3424 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425
3426 /* Note: size will always be longer than the resulting Unicode
3427 character count */
3428 unicode = _PyUnicode_New(size);
3429 if (!unicode)
3430 return NULL;
3431 if (size == 0)
3432 return (PyObject *)unicode;
3433
3434 /* Unpack UTF-16 encoded data */
3435 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00003436 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00003437 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438
3439 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00003440 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003442 /* Check for BOM marks (U+FEFF) in the input and adjust current
3443 byte order setting accordingly. In native mode, the leading BOM
3444 mark is skipped, in all other modes, it is copied to the output
3445 stream as-is (giving a ZWNBSP character). */
3446 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00003447 if (size >= 2) {
3448 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003449#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00003450 if (bom == 0xFEFF) {
3451 q += 2;
3452 bo = -1;
3453 }
3454 else if (bom == 0xFFFE) {
3455 q += 2;
3456 bo = 1;
3457 }
Tim Petersced69f82003-09-16 20:30:58 +00003458#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003459 if (bom == 0xFEFF) {
3460 q += 2;
3461 bo = 1;
3462 }
3463 else if (bom == 0xFFFE) {
3464 q += 2;
3465 bo = -1;
3466 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003467#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003468 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00003469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003470
Tim Peters772747b2001-08-09 22:21:55 +00003471 if (bo == -1) {
3472 /* force LE */
3473 ihi = 1;
3474 ilo = 0;
3475 }
3476 else if (bo == 1) {
3477 /* force BE */
3478 ihi = 0;
3479 ilo = 1;
3480 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003481#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3482 native_ordering = ilo < ihi;
3483#else
3484 native_ordering = ilo > ihi;
3485#endif
Tim Peters772747b2001-08-09 22:21:55 +00003486
Antoine Pitrouab868312009-01-10 15:40:25 +00003487 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00003488 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003489 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00003490 /* First check for possible aligned read of a C 'long'. Unaligned
3491 reads are more expensive, better to defer to another iteration. */
3492 if (!((size_t) q & LONG_PTR_MASK)) {
3493 /* Fast path for runs of non-surrogate chars. */
3494 register const unsigned char *_q = q;
3495 Py_UNICODE *_p = p;
3496 if (native_ordering) {
3497 /* Native ordering is simple: as long as the input cannot
3498 possibly contain a surrogate char, do an unrolled copy
3499 of several 16-bit code points to the target object.
3500 The non-surrogate check is done on several input bytes
3501 at a time (as many as a C 'long' can contain). */
3502 while (_q < aligned_end) {
3503 unsigned long data = * (unsigned long *) _q;
3504 if (data & FAST_CHAR_MASK)
3505 break;
3506 _p[0] = ((unsigned short *) _q)[0];
3507 _p[1] = ((unsigned short *) _q)[1];
3508#if (SIZEOF_LONG == 8)
3509 _p[2] = ((unsigned short *) _q)[2];
3510 _p[3] = ((unsigned short *) _q)[3];
3511#endif
3512 _q += SIZEOF_LONG;
3513 _p += SIZEOF_LONG / 2;
3514 }
3515 }
3516 else {
3517 /* Byteswapped ordering is similar, but we must decompose
3518 the copy bytewise, and take care of zero'ing out the
3519 upper bytes if the target object is in 32-bit units
3520 (that is, in UCS-4 builds). */
3521 while (_q < aligned_end) {
3522 unsigned long data = * (unsigned long *) _q;
3523 if (data & SWAPPED_FAST_CHAR_MASK)
3524 break;
3525 /* Zero upper bytes in UCS-4 builds */
3526#if (Py_UNICODE_SIZE > 2)
3527 _p[0] = 0;
3528 _p[1] = 0;
3529#if (SIZEOF_LONG == 8)
3530 _p[2] = 0;
3531 _p[3] = 0;
3532#endif
3533#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003534 /* Issue #4916; UCS-4 builds on big endian machines must
3535 fill the two last bytes of each 4-byte unit. */
3536#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3537# define OFF 2
3538#else
3539# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00003540#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00003541 ((unsigned char *) _p)[OFF + 1] = _q[0];
3542 ((unsigned char *) _p)[OFF + 0] = _q[1];
3543 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3544 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3545#if (SIZEOF_LONG == 8)
3546 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3547 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3548 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3549 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3550#endif
3551#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00003552 _q += SIZEOF_LONG;
3553 _p += SIZEOF_LONG / 2;
3554 }
3555 }
3556 p = _p;
3557 q = _q;
3558 if (q >= e)
3559 break;
3560 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003561 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003562
Benjamin Peterson14339b62009-01-31 16:36:08 +00003563 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00003564
3565 if (ch < 0xD800 || ch > 0xDFFF) {
3566 *p++ = ch;
3567 continue;
3568 }
3569
3570 /* UTF-16 code pair: */
3571 if (q > e) {
3572 errmsg = "unexpected end of data";
3573 startinpos = (((const char *)q) - 2) - starts;
3574 endinpos = ((const char *)e) + 1 - starts;
3575 goto utf16Error;
3576 }
3577 if (0xD800 <= ch && ch <= 0xDBFF) {
3578 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3579 q += 2;
3580 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00003581#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003582 *p++ = ch;
3583 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003584#else
Benjamin Peterson29060642009-01-31 22:14:21 +00003585 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003586#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00003587 continue;
3588 }
3589 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003590 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00003591 startinpos = (((const char *)q)-4)-starts;
3592 endinpos = startinpos+2;
3593 goto utf16Error;
3594 }
3595
Benjamin Peterson14339b62009-01-31 16:36:08 +00003596 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003597 errmsg = "illegal encoding";
3598 startinpos = (((const char *)q)-2)-starts;
3599 endinpos = startinpos+2;
3600 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003601
Benjamin Peterson29060642009-01-31 22:14:21 +00003602 utf16Error:
3603 outpos = p - PyUnicode_AS_UNICODE(unicode);
3604 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00003605 errors,
3606 &errorHandler,
3607 "utf16", errmsg,
3608 &starts,
3609 (const char **)&e,
3610 &startinpos,
3611 &endinpos,
3612 &exc,
3613 (const char **)&q,
3614 &unicode,
3615 &outpos,
3616 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003618 }
Antoine Pitrouab868312009-01-10 15:40:25 +00003619 /* remaining byte at the end? (size should be even) */
3620 if (e == q) {
3621 if (!consumed) {
3622 errmsg = "truncated data";
3623 startinpos = ((const char *)q) - starts;
3624 endinpos = ((const char *)e) + 1 - starts;
3625 outpos = p - PyUnicode_AS_UNICODE(unicode);
3626 if (unicode_decode_call_errorhandler(
3627 errors,
3628 &errorHandler,
3629 "utf16", errmsg,
3630 &starts,
3631 (const char **)&e,
3632 &startinpos,
3633 &endinpos,
3634 &exc,
3635 (const char **)&q,
3636 &unicode,
3637 &outpos,
3638 &p))
3639 goto onError;
3640 /* The remaining input chars are ignored if the callback
3641 chooses to skip the input */
3642 }
3643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644
3645 if (byteorder)
3646 *byteorder = bo;
3647
Walter Dörwald69652032004-09-07 20:24:22 +00003648 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00003649 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00003650
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003652 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 goto onError;
3654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 Py_XDECREF(errorHandler);
3656 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 return (PyObject *)unicode;
3658
Benjamin Peterson29060642009-01-31 22:14:21 +00003659 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661 Py_XDECREF(errorHandler);
3662 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 return NULL;
3664}
3665
Antoine Pitrouab868312009-01-10 15:40:25 +00003666#undef FAST_CHAR_MASK
3667#undef SWAPPED_FAST_CHAR_MASK
3668
Tim Peters772747b2001-08-09 22:21:55 +00003669PyObject *
3670PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 Py_ssize_t size,
3672 const char *errors,
3673 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003675 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00003676 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003677 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003678#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003679 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003680#else
3681 const int pairs = 0;
3682#endif
Tim Peters772747b2001-08-09 22:21:55 +00003683 /* Offsets from p for storing byte pairs in the right order. */
3684#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3685 int ihi = 1, ilo = 0;
3686#else
3687 int ihi = 0, ilo = 1;
3688#endif
3689
Benjamin Peterson29060642009-01-31 22:14:21 +00003690#define STORECHAR(CH) \
3691 do { \
3692 p[ihi] = ((CH) >> 8) & 0xff; \
3693 p[ilo] = (CH) & 0xff; \
3694 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00003695 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003697#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003698 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00003699 if (s[i] >= 0x10000)
3700 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003701#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003702 /* 2 * (size + pairs + (byteorder == 0)) */
3703 if (size > PY_SSIZE_T_MAX ||
3704 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00003705 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003706 nsize = size + pairs + (byteorder == 0);
3707 bytesize = nsize * 2;
3708 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003710 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 if (v == NULL)
3712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003714 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003715 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003716 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003717 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00003718 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00003719
3720 if (byteorder == -1) {
3721 /* force LE */
3722 ihi = 1;
3723 ilo = 0;
3724 }
3725 else if (byteorder == 1) {
3726 /* force BE */
3727 ihi = 0;
3728 ilo = 1;
3729 }
3730
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003731 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003732 Py_UNICODE ch = *s++;
3733 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003734#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00003735 if (ch >= 0x10000) {
3736 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3737 ch = 0xD800 | ((ch-0x10000) >> 10);
3738 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00003739#endif
Tim Peters772747b2001-08-09 22:21:55 +00003740 STORECHAR(ch);
3741 if (ch2)
3742 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003743 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003744
3745 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003746 return v;
Tim Peters772747b2001-08-09 22:21:55 +00003747#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748}
3749
Alexander Belopolsky40018472011-02-26 01:02:56 +00003750PyObject *
3751PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752{
3753 if (!PyUnicode_Check(unicode)) {
3754 PyErr_BadArgument();
3755 return NULL;
3756 }
3757 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00003758 PyUnicode_GET_SIZE(unicode),
3759 NULL,
3760 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003761}
3762
3763/* --- Unicode Escape Codec ----------------------------------------------- */
3764
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the named-character
   escape handling further down -- the assignment is not visible in this
   part of the file, confirm there. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00003765static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00003766
Alexander Belopolsky40018472011-02-26 01:02:56 +00003767PyObject *
3768PyUnicode_DecodeUnicodeEscape(const char *s,
3769 Py_ssize_t size,
3770 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003771{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003772 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003773 Py_ssize_t startinpos;
3774 Py_ssize_t endinpos;
3775 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003778 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003780 char* message;
3781 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003782 PyObject *errorHandler = NULL;
3783 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003784
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 /* Escaped strings will always be longer than the resulting
3786 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 length after conversion to the true value.
3788 (but if the error callback returns a long replacement string
3789 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790 v = _PyUnicode_New(size);
3791 if (v == NULL)
3792 goto onError;
3793 if (size == 0)
3794 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003795
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003797 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003798
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799 while (s < end) {
3800 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00003801 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003802 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803
3804 /* Non-escape characters are interpreted as Unicode ordinals */
3805 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003806 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003807 continue;
3808 }
3809
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003810 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 /* \ - Escapes */
3812 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003813 c = *s++;
3814 if (s > end)
3815 c = '\0'; /* Invalid after \ */
3816 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003819 case '\n': break;
3820 case '\\': *p++ = '\\'; break;
3821 case '\'': *p++ = '\''; break;
3822 case '\"': *p++ = '\"'; break;
3823 case 'b': *p++ = '\b'; break;
3824 case 'f': *p++ = '\014'; break; /* FF */
3825 case 't': *p++ = '\t'; break;
3826 case 'n': *p++ = '\n'; break;
3827 case 'r': *p++ = '\r'; break;
3828 case 'v': *p++ = '\013'; break; /* VT */
3829 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3830
Benjamin Peterson29060642009-01-31 22:14:21 +00003831 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 case '0': case '1': case '2': case '3':
3833 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003834 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003835 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003836 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00003837 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003838 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00003840 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841 break;
3842
Benjamin Peterson29060642009-01-31 22:14:21 +00003843 /* hex escapes */
3844 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003846 digits = 2;
3847 message = "truncated \\xXX escape";
3848 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849
Benjamin Peterson29060642009-01-31 22:14:21 +00003850 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003852 digits = 4;
3853 message = "truncated \\uXXXX escape";
3854 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855
Benjamin Peterson29060642009-01-31 22:14:21 +00003856 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00003857 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00003858 digits = 8;
3859 message = "truncated \\UXXXXXXXX escape";
3860 hexescape:
3861 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003862 outpos = p-PyUnicode_AS_UNICODE(v);
3863 if (s+digits>end) {
3864 endinpos = size;
3865 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003866 errors, &errorHandler,
3867 "unicodeescape", "end of string in escape sequence",
3868 &starts, &end, &startinpos, &endinpos, &exc, &s,
3869 &v, &outpos, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003870 goto onError;
3871 goto nextByte;
3872 }
3873 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00003874 c = (unsigned char) s[i];
David Malcolm96960882010-11-05 17:23:41 +00003875 if (!Py_ISXDIGIT(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876 endinpos = (s+i+1)-starts;
3877 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003878 errors, &errorHandler,
3879 "unicodeescape", message,
3880 &starts, &end, &startinpos, &endinpos, &exc, &s,
3881 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003882 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00003884 }
3885 chr = (chr<<4) & ~0xF;
3886 if (c >= '0' && c <= '9')
3887 chr += c - '0';
3888 else if (c >= 'a' && c <= 'f')
3889 chr += 10 + c - 'a';
3890 else
3891 chr += 10 + c - 'A';
3892 }
3893 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00003894 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003895 /* _decoding_error will have already written into the
3896 target buffer. */
3897 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003898 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00003899 /* when we get here, chr is a 32-bit unicode character */
3900 if (chr <= 0xffff)
3901 /* UCS-2 character */
3902 *p++ = (Py_UNICODE) chr;
3903 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00003904 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00003905 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00003906#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003907 *p++ = chr;
3908#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00003909 chr -= 0x10000L;
3910 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00003911 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00003912#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00003913 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003914 endinpos = s-starts;
3915 outpos = p-PyUnicode_AS_UNICODE(v);
3916 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003917 errors, &errorHandler,
3918 "unicodeescape", "illegal Unicode character",
3919 &starts, &end, &startinpos, &endinpos, &exc, &s,
3920 &v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00003921 goto onError;
3922 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003923 break;
3924
Benjamin Peterson29060642009-01-31 22:14:21 +00003925 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00003926 case 'N':
3927 message = "malformed \\N character escape";
3928 if (ucnhash_CAPI == NULL) {
3929 /* load the unicode data module */
Benjamin Petersonb173f782009-05-05 22:31:58 +00003930 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00003931 if (ucnhash_CAPI == NULL)
3932 goto ucnhashError;
3933 }
3934 if (*s == '{') {
3935 const char *start = s+1;
3936 /* look for the closing brace */
3937 while (*s != '}' && s < end)
3938 s++;
3939 if (s > start && s < end && *s == '}') {
3940 /* found a name. look it up in the unicode database */
3941 message = "unknown Unicode character name";
3942 s++;
Martin v. Löwis480f1bb2006-03-09 23:38:20 +00003943 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003944 goto store;
3945 }
3946 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947 endinpos = s-starts;
3948 outpos = p-PyUnicode_AS_UNICODE(v);
3949 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 errors, &errorHandler,
3951 "unicodeescape", message,
3952 &starts, &end, &startinpos, &endinpos, &exc, &s,
3953 &v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00003954 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00003955 break;
3956
3957 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00003958 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003959 message = "\\ at end of string";
3960 s--;
3961 endinpos = s-starts;
3962 outpos = p-PyUnicode_AS_UNICODE(v);
3963 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003964 errors, &errorHandler,
3965 "unicodeescape", message,
3966 &starts, &end, &startinpos, &endinpos, &exc, &s,
3967 &v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00003968 goto onError;
3969 }
3970 else {
3971 *p++ = '\\';
3972 *p++ = (unsigned char)s[-1];
3973 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00003974 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 }
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00003979 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00003981 Py_XDECREF(errorHandler);
3982 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00003984
Benjamin Peterson29060642009-01-31 22:14:21 +00003985 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00003986 PyErr_SetString(
3987 PyExc_UnicodeError,
3988 "\\N escapes not supported (can't load unicodedata module)"
3989 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00003990 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 Py_XDECREF(errorHandler);
3992 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00003993 return NULL;
3994
Benjamin Peterson29060642009-01-31 22:14:21 +00003995 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003997 Py_XDECREF(errorHandler);
3998 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 return NULL;
4000}
4001
4002/* Return a Unicode-Escape string version of the Unicode object.
4003
4004 If quotes is true, the string is enclosed in u"" or u'' quotes as
4005 appropriate.
4006
4007*/
4008
Thomas Wouters477c8d52006-05-27 19:21:47 +00004009Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004010 Py_ssize_t size,
4011 Py_UNICODE ch)
Thomas Wouters477c8d52006-05-27 19:21:47 +00004012{
4013 /* like wcschr, but doesn't stop at NULL characters */
4014
4015 while (size-- > 0) {
4016 if (*s == ch)
4017 return s;
4018 s++;
4019 }
4020
4021 return NULL;
4022}
Barry Warsaw51ac5802000-03-20 16:36:48 +00004023
Walter Dörwald79e913e2007-05-12 11:08:06 +00004024static const char *hexdigits = "0123456789abcdef";
4025
Alexander Belopolsky40018472011-02-26 01:02:56 +00004026PyObject *
4027PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4028 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004029{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004030 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004033#ifdef Py_UNICODE_WIDE
4034 const Py_ssize_t expandsize = 10;
4035#else
4036 const Py_ssize_t expandsize = 6;
4037#endif
4038
Thomas Wouters89f507f2006-12-13 04:49:30 +00004039 /* XXX(nnorwitz): rather than over-allocating, it would be
4040 better to choose a different scheme. Perhaps scan the
4041 first N-chars of the string and allocate based on that size.
4042 */
4043 /* Initial allocation is based on the longest-possible unichr
4044 escape.
4045
4046 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4047 unichr, so in this case it's the longest unichr escape. In
4048 narrow (UTF-16) builds this is five chars per source unichr
4049 since there are two unichrs in the surrogate pair, so in narrow
4050 (UTF-16) builds it's not the longest unichr escape.
4051
4052 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4053 so in the narrow (UTF-16) build case it's the longest unichr
4054 escape.
4055 */
4056
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004057 if (size == 0)
4058 return PyBytes_FromStringAndSize(NULL, 0);
4059
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004060 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004061 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004062
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004063 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00004064 2
4065 + expandsize*size
4066 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067 if (repr == NULL)
4068 return NULL;
4069
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004070 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 while (size-- > 0) {
4073 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004074
Walter Dörwald79e913e2007-05-12 11:08:06 +00004075 /* Escape backslashes */
4076 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 *p++ = '\\';
4078 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00004079 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004080 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004081
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00004082#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004083 /* Map 21-bit characters to '\U00xxxxxx' */
4084 else if (ch >= 0x10000) {
4085 *p++ = '\\';
4086 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004087 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4088 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4089 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4090 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4091 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4092 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4093 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4094 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00004095 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004096 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004097#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004098 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4099 else if (ch >= 0xD800 && ch < 0xDC00) {
4100 Py_UNICODE ch2;
4101 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00004102
Benjamin Peterson29060642009-01-31 22:14:21 +00004103 ch2 = *s++;
4104 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004105 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004106 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4107 *p++ = '\\';
4108 *p++ = 'U';
4109 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4110 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4111 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4112 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4113 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4114 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4115 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4116 *p++ = hexdigits[ucs & 0x0000000F];
4117 continue;
4118 }
4119 /* Fall through: isolated surrogates are copied as-is */
4120 s--;
4121 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004122 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00004123#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00004126 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004127 *p++ = '\\';
4128 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004129 *p++ = hexdigits[(ch >> 12) & 0x000F];
4130 *p++ = hexdigits[(ch >> 8) & 0x000F];
4131 *p++ = hexdigits[(ch >> 4) & 0x000F];
4132 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004133 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004134
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004135 /* Map special whitespace to '\t', \n', '\r' */
4136 else if (ch == '\t') {
4137 *p++ = '\\';
4138 *p++ = 't';
4139 }
4140 else if (ch == '\n') {
4141 *p++ = '\\';
4142 *p++ = 'n';
4143 }
4144 else if (ch == '\r') {
4145 *p++ = '\\';
4146 *p++ = 'r';
4147 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004148
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004149 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00004150 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00004152 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00004153 *p++ = hexdigits[(ch >> 4) & 0x000F];
4154 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00004155 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00004156
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157 /* Copy everything else as-is */
4158 else
4159 *p++ = (char) ch;
4160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004161
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004162 assert(p - PyBytes_AS_STRING(repr) > 0);
4163 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4164 return NULL;
4165 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166}
4167
Alexander Belopolsky40018472011-02-26 01:02:56 +00004168PyObject *
4169PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004171 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172 if (!PyUnicode_Check(unicode)) {
4173 PyErr_BadArgument();
4174 return NULL;
4175 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00004176 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4177 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004178 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179}
4180
4181/* --- Raw Unicode Escape Codec ------------------------------------------- */
4182
Alexander Belopolsky40018472011-02-26 01:02:56 +00004183PyObject *
4184PyUnicode_DecodeRawUnicodeEscape(const char *s,
4185 Py_ssize_t size,
4186 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004189 Py_ssize_t startinpos;
4190 Py_ssize_t endinpos;
4191 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194 const char *end;
4195 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 PyObject *errorHandler = NULL;
4197 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004198
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 /* Escaped strings will always be longer than the resulting
4200 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 length after conversion to the true value. (But decoding error
4202 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004203 v = _PyUnicode_New(size);
4204 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004205 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004207 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 end = s + size;
4210 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004211 unsigned char c;
4212 Py_UCS4 x;
4213 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004214 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215
Benjamin Peterson29060642009-01-31 22:14:21 +00004216 /* Non-escape characters are interpreted as Unicode ordinals */
4217 if (*s != '\\') {
4218 *p++ = (unsigned char)*s++;
4219 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004220 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004221 startinpos = s-starts;
4222
4223 /* \u-escapes are only interpreted iff the number of leading
4224 backslashes if odd */
4225 bs = s;
4226 for (;s < end;) {
4227 if (*s != '\\')
4228 break;
4229 *p++ = (unsigned char)*s++;
4230 }
4231 if (((s - bs) & 1) == 0 ||
4232 s >= end ||
4233 (*s != 'u' && *s != 'U')) {
4234 continue;
4235 }
4236 p--;
4237 count = *s=='u' ? 4 : 8;
4238 s++;
4239
4240 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4241 outpos = p-PyUnicode_AS_UNICODE(v);
4242 for (x = 0, i = 0; i < count; ++i, ++s) {
4243 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00004244 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004245 endinpos = s-starts;
4246 if (unicode_decode_call_errorhandler(
4247 errors, &errorHandler,
4248 "rawunicodeescape", "truncated \\uXXXX",
4249 &starts, &end, &startinpos, &endinpos, &exc, &s,
4250 &v, &outpos, &p))
4251 goto onError;
4252 goto nextByte;
4253 }
4254 x = (x<<4) & ~0xF;
4255 if (c >= '0' && c <= '9')
4256 x += c - '0';
4257 else if (c >= 'a' && c <= 'f')
4258 x += 10 + c - 'a';
4259 else
4260 x += 10 + c - 'A';
4261 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00004262 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00004263 /* UCS-2 character */
4264 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004265 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004266 /* UCS-4 character. Either store directly, or as
4267 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00004268#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004269 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004270#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 x -= 0x10000L;
4272 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4273 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00004274#endif
4275 } else {
4276 endinpos = s-starts;
4277 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004278 if (unicode_decode_call_errorhandler(
4279 errors, &errorHandler,
4280 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00004281 &starts, &end, &startinpos, &endinpos, &exc, &s,
4282 &v, &outpos, &p))
4283 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004284 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004285 nextByte:
4286 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 }
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004288 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004289 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004290 Py_XDECREF(errorHandler);
4291 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004293
Benjamin Peterson29060642009-01-31 22:14:21 +00004294 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004296 Py_XDECREF(errorHandler);
4297 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 return NULL;
4299}
4300
Alexander Belopolsky40018472011-02-26 01:02:56 +00004301PyObject *
4302PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4303 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004304{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004305 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 char *p;
4307 char *q;
4308
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004309#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004310 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004311#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004312 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004313#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00004314
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004315 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00004317
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004318 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319 if (repr == NULL)
4320 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00004321 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004322 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004324 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 while (size-- > 0) {
4326 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004327#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004328 /* Map 32-bit characters to '\Uxxxxxxxx' */
4329 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004330 *p++ = '\\';
4331 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004332 *p++ = hexdigits[(ch >> 28) & 0xf];
4333 *p++ = hexdigits[(ch >> 24) & 0xf];
4334 *p++ = hexdigits[(ch >> 20) & 0xf];
4335 *p++ = hexdigits[(ch >> 16) & 0xf];
4336 *p++ = hexdigits[(ch >> 12) & 0xf];
4337 *p++ = hexdigits[(ch >> 8) & 0xf];
4338 *p++ = hexdigits[(ch >> 4) & 0xf];
4339 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00004340 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004341 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00004342#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004343 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4344 if (ch >= 0xD800 && ch < 0xDC00) {
4345 Py_UNICODE ch2;
4346 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00004347
Benjamin Peterson29060642009-01-31 22:14:21 +00004348 ch2 = *s++;
4349 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00004350 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004351 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4352 *p++ = '\\';
4353 *p++ = 'U';
4354 *p++ = hexdigits[(ucs >> 28) & 0xf];
4355 *p++ = hexdigits[(ucs >> 24) & 0xf];
4356 *p++ = hexdigits[(ucs >> 20) & 0xf];
4357 *p++ = hexdigits[(ucs >> 16) & 0xf];
4358 *p++ = hexdigits[(ucs >> 12) & 0xf];
4359 *p++ = hexdigits[(ucs >> 8) & 0xf];
4360 *p++ = hexdigits[(ucs >> 4) & 0xf];
4361 *p++ = hexdigits[ucs & 0xf];
4362 continue;
4363 }
4364 /* Fall through: isolated surrogates are copied as-is */
4365 s--;
4366 size++;
4367 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00004368#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 /* Map 16-bit characters to '\uxxxx' */
4370 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 *p++ = '\\';
4372 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00004373 *p++ = hexdigits[(ch >> 12) & 0xf];
4374 *p++ = hexdigits[(ch >> 8) & 0xf];
4375 *p++ = hexdigits[(ch >> 4) & 0xf];
4376 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 /* Copy everything else as-is */
4379 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 *p++ = (char) ch;
4381 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004382 size = p - q;
4383
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004384 assert(size > 0);
4385 if (_PyBytes_Resize(&repr, size) < 0)
4386 return NULL;
4387 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388}
4389
Alexander Belopolsky40018472011-02-26 01:02:56 +00004390PyObject *
4391PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004393 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00004395 PyErr_BadArgument();
4396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397 }
Walter Dörwald711005d2007-05-12 12:03:26 +00004398 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4399 PyUnicode_GET_SIZE(unicode));
4400
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00004401 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402}
4403
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004404/* --- Unicode Internal Codec ------------------------------------------- */
4405
Alexander Belopolsky40018472011-02-26 01:02:56 +00004406PyObject *
4407_PyUnicode_DecodeUnicodeInternal(const char *s,
4408 Py_ssize_t size,
4409 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004410{
4411 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004412 Py_ssize_t startinpos;
4413 Py_ssize_t endinpos;
4414 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004415 PyUnicodeObject *v;
4416 Py_UNICODE *p;
4417 const char *end;
4418 const char *reason;
4419 PyObject *errorHandler = NULL;
4420 PyObject *exc = NULL;
4421
Neal Norwitzd43069c2006-01-08 01:12:10 +00004422#ifdef Py_UNICODE_WIDE
4423 Py_UNICODE unimax = PyUnicode_GetMax();
4424#endif
4425
Thomas Wouters89f507f2006-12-13 04:49:30 +00004426 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004427 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4428 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004430 if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004432 p = PyUnicode_AS_UNICODE(v);
4433 end = s + size;
4434
4435 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00004436 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004437 /* We have to sanity check the raw data, otherwise doom looms for
4438 some malformed UCS-4 data. */
4439 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00004440#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004441 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00004442#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004443 end-s < Py_UNICODE_SIZE
4444 )
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004446 startinpos = s - starts;
4447 if (end-s < Py_UNICODE_SIZE) {
4448 endinpos = end-starts;
4449 reason = "truncated input";
4450 }
4451 else {
4452 endinpos = s - starts + Py_UNICODE_SIZE;
4453 reason = "illegal code point (> 0x10FFFF)";
4454 }
4455 outpos = p - PyUnicode_AS_UNICODE(v);
4456 if (unicode_decode_call_errorhandler(
4457 errors, &errorHandler,
4458 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00004459 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00004460 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004461 goto onError;
4462 }
4463 }
4464 else {
4465 p++;
4466 s += Py_UNICODE_SIZE;
4467 }
4468 }
4469
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004470 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004471 goto onError;
4472 Py_XDECREF(errorHandler);
4473 Py_XDECREF(exc);
4474 return (PyObject *)v;
4475
Benjamin Peterson29060642009-01-31 22:14:21 +00004476 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00004477 Py_XDECREF(v);
4478 Py_XDECREF(errorHandler);
4479 Py_XDECREF(exc);
4480 return NULL;
4481}
4482
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483/* --- Latin-1 Codec ------------------------------------------------------ */
4484
Alexander Belopolsky40018472011-02-26 01:02:56 +00004485PyObject *
4486PyUnicode_DecodeLatin1(const char *s,
4487 Py_ssize_t size,
4488 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489{
4490 PyUnicodeObject *v;
4491 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00004492 const char *e, *unrolled_end;
Tim Petersced69f82003-09-16 20:30:58 +00004493
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004495 if (size == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 Py_UNICODE r = *(unsigned char*)s;
4497 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004498 }
4499
Guido van Rossumd57fd912000-03-10 22:53:23 +00004500 v = _PyUnicode_New(size);
4501 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004502 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004504 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505 p = PyUnicode_AS_UNICODE(v);
Antoine Pitrouab868312009-01-10 15:40:25 +00004506 e = s + size;
4507 /* Unrolling the copy makes it much faster by reducing the looping
4508 overhead. This is similar to what many memcpy() implementations do. */
4509 unrolled_end = e - 4;
4510 while (s < unrolled_end) {
4511 p[0] = (unsigned char) s[0];
4512 p[1] = (unsigned char) s[1];
4513 p[2] = (unsigned char) s[2];
4514 p[3] = (unsigned char) s[3];
4515 s += 4;
4516 p += 4;
4517 }
4518 while (s < e)
4519 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004521
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 Py_XDECREF(v);
4524 return NULL;
4525}
4526
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004527/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004528static void
4529make_encode_exception(PyObject **exceptionObject,
4530 const char *encoding,
4531 const Py_UNICODE *unicode, Py_ssize_t size,
4532 Py_ssize_t startpos, Py_ssize_t endpos,
4533 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004535 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004536 *exceptionObject = PyUnicodeEncodeError_Create(
4537 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538 }
4539 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4541 goto onError;
4542 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4543 goto onError;
4544 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4545 goto onError;
4546 return;
4547 onError:
4548 Py_DECREF(*exceptionObject);
4549 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550 }
4551}
4552
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004553/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004554static void
4555raise_encode_exception(PyObject **exceptionObject,
4556 const char *encoding,
4557 const Py_UNICODE *unicode, Py_ssize_t size,
4558 Py_ssize_t startpos, Py_ssize_t endpos,
4559 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560{
4561 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004562 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004563 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004565}
4566
4567/* error handling callback helper:
4568 build arguments, call the callback and check the arguments,
4569 put the result into newpos and return the replacement string, which
4570 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004571static PyObject *
4572unicode_encode_call_errorhandler(const char *errors,
4573 PyObject **errorHandler,
4574 const char *encoding, const char *reason,
4575 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4576 Py_ssize_t startpos, Py_ssize_t endpos,
4577 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004579 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004580
4581 PyObject *restuple;
4582 PyObject *resunicode;
4583
4584 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004585 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004586 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004587 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004588 }
4589
4590 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00004591 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004592 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004594
4595 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004598 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004599 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004600 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 Py_DECREF(restuple);
4602 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004604 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00004605 &resunicode, newpos)) {
4606 Py_DECREF(restuple);
4607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004608 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004609 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4610 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4611 Py_DECREF(restuple);
4612 return NULL;
4613 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004614 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004615 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004616 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4618 Py_DECREF(restuple);
4619 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004620 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004621 Py_INCREF(resunicode);
4622 Py_DECREF(restuple);
4623 return resunicode;
4624}
4625
Alexander Belopolsky40018472011-02-26 01:02:56 +00004626static PyObject *
4627unicode_encode_ucs1(const Py_UNICODE *p,
4628 Py_ssize_t size,
4629 const char *errors,
4630 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004631{
4632 /* output object */
4633 PyObject *res;
4634 /* pointers to the beginning and end+1 of input */
4635 const Py_UNICODE *startp = p;
4636 const Py_UNICODE *endp = p + size;
4637 /* pointer to the beginning of the unencodable characters */
4638 /* const Py_UNICODE *badp = NULL; */
4639 /* pointer into the output */
4640 char *str;
4641 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00004642 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004643 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4644 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 PyObject *errorHandler = NULL;
4646 PyObject *exc = NULL;
4647 /* the following variable is used for caching string comparisons
4648 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4649 int known_errorHandler = -1;
4650
4651 /* allocate enough for a simple encoding without
4652 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00004653 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00004654 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004655 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004656 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004657 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004658 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004659 ressize = size;
4660
4661 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004663
Benjamin Peterson29060642009-01-31 22:14:21 +00004664 /* can we encode this? */
4665 if (c<limit) {
4666 /* no overflow check, because we know that the space is enough */
4667 *str++ = (char)c;
4668 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004669 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004670 else {
4671 Py_ssize_t unicodepos = p-startp;
4672 Py_ssize_t requiredsize;
4673 PyObject *repunicode;
4674 Py_ssize_t repsize;
4675 Py_ssize_t newpos;
4676 Py_ssize_t respos;
4677 Py_UNICODE *uni2;
4678 /* startpos for collecting unencodable chars */
4679 const Py_UNICODE *collstart = p;
4680 const Py_UNICODE *collend = p;
4681 /* find all unecodable characters */
4682 while ((collend < endp) && ((*collend)>=limit))
4683 ++collend;
4684 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4685 if (known_errorHandler==-1) {
4686 if ((errors==NULL) || (!strcmp(errors, "strict")))
4687 known_errorHandler = 1;
4688 else if (!strcmp(errors, "replace"))
4689 known_errorHandler = 2;
4690 else if (!strcmp(errors, "ignore"))
4691 known_errorHandler = 3;
4692 else if (!strcmp(errors, "xmlcharrefreplace"))
4693 known_errorHandler = 4;
4694 else
4695 known_errorHandler = 0;
4696 }
4697 switch (known_errorHandler) {
4698 case 1: /* strict */
4699 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4700 goto onError;
4701 case 2: /* replace */
4702 while (collstart++<collend)
4703 *str++ = '?'; /* fall through */
4704 case 3: /* ignore */
4705 p = collend;
4706 break;
4707 case 4: /* xmlcharrefreplace */
4708 respos = str - PyBytes_AS_STRING(res);
4709 /* determine replacement size (temporarily (mis)uses p) */
4710 for (p = collstart, repsize = 0; p < collend; ++p) {
4711 if (*p<10)
4712 repsize += 2+1+1;
4713 else if (*p<100)
4714 repsize += 2+2+1;
4715 else if (*p<1000)
4716 repsize += 2+3+1;
4717 else if (*p<10000)
4718 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004719#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004720 else
4721 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00004722#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004723 else if (*p<100000)
4724 repsize += 2+5+1;
4725 else if (*p<1000000)
4726 repsize += 2+6+1;
4727 else
4728 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00004729#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004730 }
4731 requiredsize = respos+repsize+(endp-collend);
4732 if (requiredsize > ressize) {
4733 if (requiredsize<2*ressize)
4734 requiredsize = 2*ressize;
4735 if (_PyBytes_Resize(&res, requiredsize))
4736 goto onError;
4737 str = PyBytes_AS_STRING(res) + respos;
4738 ressize = requiredsize;
4739 }
4740 /* generate replacement (temporarily (mis)uses p) */
4741 for (p = collstart; p < collend; ++p) {
4742 str += sprintf(str, "&#%d;", (int)*p);
4743 }
4744 p = collend;
4745 break;
4746 default:
4747 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4748 encoding, reason, startp, size, &exc,
4749 collstart-startp, collend-startp, &newpos);
4750 if (repunicode == NULL)
4751 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004752 if (PyBytes_Check(repunicode)) {
4753 /* Directly copy bytes result to output. */
4754 repsize = PyBytes_Size(repunicode);
4755 if (repsize > 1) {
4756 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004757 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004758 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4759 Py_DECREF(repunicode);
4760 goto onError;
4761 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00004762 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004763 ressize += repsize-1;
4764 }
4765 memcpy(str, PyBytes_AsString(repunicode), repsize);
4766 str += repsize;
4767 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004768 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00004769 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004770 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004771 /* need more space? (at least enough for what we
4772 have+the replacement+the rest of the string, so
4773 we won't have to check space for encodable characters) */
4774 respos = str - PyBytes_AS_STRING(res);
4775 repsize = PyUnicode_GET_SIZE(repunicode);
4776 requiredsize = respos+repsize+(endp-collend);
4777 if (requiredsize > ressize) {
4778 if (requiredsize<2*ressize)
4779 requiredsize = 2*ressize;
4780 if (_PyBytes_Resize(&res, requiredsize)) {
4781 Py_DECREF(repunicode);
4782 goto onError;
4783 }
4784 str = PyBytes_AS_STRING(res) + respos;
4785 ressize = requiredsize;
4786 }
4787 /* check if there is anything unencodable in the replacement
4788 and copy it to the output */
4789 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4790 c = *uni2;
4791 if (c >= limit) {
4792 raise_encode_exception(&exc, encoding, startp, size,
4793 unicodepos, unicodepos+1, reason);
4794 Py_DECREF(repunicode);
4795 goto onError;
4796 }
4797 *str = (char)c;
4798 }
4799 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004800 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00004801 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004802 }
4803 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004804 /* Resize if we allocated to much */
4805 size = str - PyBytes_AS_STRING(res);
4806 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00004807 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004808 if (_PyBytes_Resize(&res, size) < 0)
4809 goto onError;
4810 }
4811
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004812 Py_XDECREF(errorHandler);
4813 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004814 return res;
4815
4816 onError:
4817 Py_XDECREF(res);
4818 Py_XDECREF(errorHandler);
4819 Py_XDECREF(exc);
4820 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004821}
4822
Alexander Belopolsky40018472011-02-26 01:02:56 +00004823PyObject *
4824PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4825 Py_ssize_t size,
4826 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004828 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829}
4830
Alexander Belopolsky40018472011-02-26 01:02:56 +00004831PyObject *
4832PyUnicode_AsLatin1String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833{
4834 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 PyErr_BadArgument();
4836 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 }
4838 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 PyUnicode_GET_SIZE(unicode),
4840 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841}
4842
4843/* --- 7-bit ASCII Codec -------------------------------------------------- */
4844
Alexander Belopolsky40018472011-02-26 01:02:56 +00004845PyObject *
4846PyUnicode_DecodeASCII(const char *s,
4847 Py_ssize_t size,
4848 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004850 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 PyUnicodeObject *v;
4852 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004853 Py_ssize_t startinpos;
4854 Py_ssize_t endinpos;
4855 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 const char *e;
4857 PyObject *errorHandler = NULL;
4858 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004859
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004861 if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 Py_UNICODE r = *(unsigned char*)s;
4863 return PyUnicode_FromUnicode(&r, 1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004864 }
Tim Petersced69f82003-09-16 20:30:58 +00004865
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 v = _PyUnicode_New(size);
4867 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004872 e = s + size;
4873 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004874 register unsigned char c = (unsigned char)*s;
4875 if (c < 128) {
4876 *p++ = c;
4877 ++s;
4878 }
4879 else {
4880 startinpos = s-starts;
4881 endinpos = startinpos + 1;
4882 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4883 if (unicode_decode_call_errorhandler(
4884 errors, &errorHandler,
4885 "ascii", "ordinal not in range(128)",
4886 &starts, &e, &startinpos, &endinpos, &exc, &s,
4887 &v, &outpos, &p))
4888 goto onError;
4889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 }
Martin v. Löwis5b222132007-06-10 09:51:05 +00004891 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4893 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004894 Py_XDECREF(errorHandler);
4895 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00004897
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004900 Py_XDECREF(errorHandler);
4901 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 return NULL;
4903}
4904
Alexander Belopolsky40018472011-02-26 01:02:56 +00004905PyObject *
4906PyUnicode_EncodeASCII(const Py_UNICODE *p,
4907 Py_ssize_t size,
4908 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004910 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911}
4912
Alexander Belopolsky40018472011-02-26 01:02:56 +00004913PyObject *
4914PyUnicode_AsASCIIString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915{
4916 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004917 PyErr_BadArgument();
4918 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 }
4920 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004921 PyUnicode_GET_SIZE(unicode),
4922 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923}
4924
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00004925#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004926
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00004927/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00004928
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00004929#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004930#define NEED_RETRY
4931#endif
4932
4933/* XXX This code is limited to "true" double-byte encodings, as
4934 a) it assumes an incomplete character consists of a single byte, and
4935 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004937
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts only counts when the character
   before it (located with CharPrev()) does not swallow it as a trail byte:
   either there is no previous character, the previous character does not
   start with a lead byte, or it is a complete two-byte character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
4949
4950/*
4951 * Decode MBCS string into unicode object. If 'final' is set, converts
4952 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4953 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004954static int
4955decode_mbcs(PyUnicodeObject **v,
4956 const char *s, /* MBCS string */
4957 int size, /* sizeof MBCS string */
4958 int final,
4959 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004960{
4961 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00004962 Py_ssize_t n;
4963 DWORD usize;
4964 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004965
4966 assert(size >= 0);
4967
Victor Stinner554f3f02010-06-16 23:33:54 +00004968 /* check and handle 'errors' arg */
4969 if (errors==NULL || strcmp(errors, "strict")==0)
4970 flags = MB_ERR_INVALID_CHARS;
4971 else if (strcmp(errors, "ignore")==0)
4972 flags = 0;
4973 else {
4974 PyErr_Format(PyExc_ValueError,
4975 "mbcs encoding does not support errors='%s'",
4976 errors);
4977 return -1;
4978 }
4979
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004980 /* Skip trailing lead-byte unless 'final' is set */
4981 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004983
4984 /* First get the size of the result */
4985 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00004986 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4987 if (usize==0)
4988 goto mbcs_decode_error;
4989 } else
4990 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004991
4992 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004993 /* Create unicode object */
4994 *v = _PyUnicode_New(usize);
4995 if (*v == NULL)
4996 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00004997 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00004998 }
4999 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005000 /* Extend unicode object */
5001 n = PyUnicode_GET_SIZE(*v);
5002 if (_PyUnicode_Resize(v, n + usize) < 0)
5003 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005004 }
5005
5006 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00005007 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005008 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005009 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5010 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005012 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005013 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00005014
5015mbcs_decode_error:
5016 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5017 we raise a UnicodeDecodeError - else it is a 'generic'
5018 windows error
5019 */
5020 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5021 /* Ideally, we should get reason from FormatMessage - this
5022 is the Windows 2000 English version of the message
5023 */
5024 PyObject *exc = NULL;
5025 const char *reason = "No mapping for the Unicode character exists "
5026 "in the target multi-byte code page.";
5027 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5028 if (exc != NULL) {
5029 PyCodec_StrictErrors(exc);
5030 Py_DECREF(exc);
5031 }
5032 } else {
5033 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5034 }
5035 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005036}
5037
Alexander Belopolsky40018472011-02-26 01:02:56 +00005038PyObject *
5039PyUnicode_DecodeMBCSStateful(const char *s,
5040 Py_ssize_t size,
5041 const char *errors,
5042 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005043{
5044 PyUnicodeObject *v = NULL;
5045 int done;
5046
5047 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005049
5050#ifdef NEED_RETRY
5051 retry:
5052 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005053 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005054 else
5055#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005056 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005057
5058 if (done < 0) {
5059 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005061 }
5062
5063 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005065
5066#ifdef NEED_RETRY
5067 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 s += done;
5069 size -= done;
5070 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005071 }
5072#endif
5073
5074 return (PyObject *)v;
5075}
5076
Alexander Belopolsky40018472011-02-26 01:02:56 +00005077PyObject *
5078PyUnicode_DecodeMBCS(const char *s,
5079 Py_ssize_t size,
5080 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005081{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005082 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5083}
5084
5085/*
5086 * Convert unicode into string object (MBCS).
5087 * Returns 0 if succeed, -1 otherwise.
5088 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005089static int
5090encode_mbcs(PyObject **repr,
5091 const Py_UNICODE *p, /* unicode */
5092 int size, /* size of unicode */
5093 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005094{
Victor Stinner554f3f02010-06-16 23:33:54 +00005095 BOOL usedDefaultChar = FALSE;
5096 BOOL *pusedDefaultChar;
5097 int mbcssize;
5098 Py_ssize_t n;
5099 PyObject *exc = NULL;
5100 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005101
5102 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005103
Victor Stinner554f3f02010-06-16 23:33:54 +00005104 /* check and handle 'errors' arg */
5105 if (errors==NULL || strcmp(errors, "strict")==0) {
5106 flags = WC_NO_BEST_FIT_CHARS;
5107 pusedDefaultChar = &usedDefaultChar;
5108 } else if (strcmp(errors, "replace")==0) {
5109 flags = 0;
5110 pusedDefaultChar = NULL;
5111 } else {
5112 PyErr_Format(PyExc_ValueError,
5113 "mbcs encoding does not support errors='%s'",
5114 errors);
5115 return -1;
5116 }
5117
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005118 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005119 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00005120 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5121 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 if (mbcssize == 0) {
5123 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5124 return -1;
5125 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005126 /* If we used a default char, then we failed! */
5127 if (pusedDefaultChar && *pusedDefaultChar)
5128 goto mbcs_encode_error;
5129 } else {
5130 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005131 }
5132
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005133 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 /* Create string object */
5135 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5136 if (*repr == NULL)
5137 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00005138 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005139 }
5140 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 /* Extend string object */
5142 n = PyBytes_Size(*repr);
5143 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5144 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005145 }
5146
5147 /* Do the conversion */
5148 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005149 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00005150 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5151 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 PyErr_SetFromWindowsErrWithFilename(0, NULL);
5153 return -1;
5154 }
Victor Stinner554f3f02010-06-16 23:33:54 +00005155 if (pusedDefaultChar && *pusedDefaultChar)
5156 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005157 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005158 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00005159
5160mbcs_encode_error:
5161 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5162 Py_XDECREF(exc);
5163 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005164}
5165
Alexander Belopolsky40018472011-02-26 01:02:56 +00005166PyObject *
5167PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5168 Py_ssize_t size,
5169 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005170{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005171 PyObject *repr = NULL;
5172 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00005173
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005174#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00005175 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005176 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00005177 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005178 else
5179#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00005180 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005181
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005182 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 Py_XDECREF(repr);
5184 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005185 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005186
5187#ifdef NEED_RETRY
5188 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 p += INT_MAX;
5190 size -= INT_MAX;
5191 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005192 }
5193#endif
5194
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005195 return repr;
5196}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00005197
Alexander Belopolsky40018472011-02-26 01:02:56 +00005198PyObject *
5199PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005200{
5201 if (!PyUnicode_Check(unicode)) {
5202 PyErr_BadArgument();
5203 return NULL;
5204 }
5205 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 PyUnicode_GET_SIZE(unicode),
5207 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00005208}
5209
Thomas Wouters0e3f5912006-08-11 14:57:12 +00005210#undef NEED_RETRY
5211
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00005212#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00005213
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214/* --- Character Mapping Codec -------------------------------------------- */
5215
Alexander Belopolsky40018472011-02-26 01:02:56 +00005216PyObject *
5217PyUnicode_DecodeCharmap(const char *s,
5218 Py_ssize_t size,
5219 PyObject *mapping,
5220 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005222 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005223 Py_ssize_t startinpos;
5224 Py_ssize_t endinpos;
5225 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005226 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 PyUnicodeObject *v;
5228 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005229 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005230 PyObject *errorHandler = NULL;
5231 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005232 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005233 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00005234
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 /* Default to Latin-1 */
5236 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238
5239 v = _PyUnicode_New(size);
5240 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005245 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005246 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005247 mapstring = PyUnicode_AS_UNICODE(mapping);
5248 maplen = PyUnicode_GET_SIZE(mapping);
5249 while (s < e) {
5250 unsigned char ch = *s;
5251 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 if (ch < maplen)
5254 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
Benjamin Peterson29060642009-01-31 22:14:21 +00005256 if (x == 0xfffe) {
5257 /* undefined mapping */
5258 outpos = p-PyUnicode_AS_UNICODE(v);
5259 startinpos = s-starts;
5260 endinpos = startinpos+1;
5261 if (unicode_decode_call_errorhandler(
5262 errors, &errorHandler,
5263 "charmap", "character maps to <undefined>",
5264 &starts, &e, &startinpos, &endinpos, &exc, &s,
5265 &v, &outpos, &p)) {
5266 goto onError;
5267 }
5268 continue;
5269 }
5270 *p++ = x;
5271 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005272 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005273 }
5274 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005275 while (s < e) {
5276 unsigned char ch = *s;
5277 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00005278
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 /* Get mapping (char ordinal -> integer, Unicode char or None) */
5280 w = PyLong_FromLong((long)ch);
5281 if (w == NULL)
5282 goto onError;
5283 x = PyObject_GetItem(mapping, w);
5284 Py_DECREF(w);
5285 if (x == NULL) {
5286 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5287 /* No mapping found means: mapping is undefined. */
5288 PyErr_Clear();
5289 x = Py_None;
5290 Py_INCREF(x);
5291 } else
5292 goto onError;
5293 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005294
Benjamin Peterson29060642009-01-31 22:14:21 +00005295 /* Apply mapping */
5296 if (PyLong_Check(x)) {
5297 long value = PyLong_AS_LONG(x);
5298 if (value < 0 || value > 65535) {
5299 PyErr_SetString(PyExc_TypeError,
5300 "character mapping must be in range(65536)");
5301 Py_DECREF(x);
5302 goto onError;
5303 }
5304 *p++ = (Py_UNICODE)value;
5305 }
5306 else if (x == Py_None) {
5307 /* undefined mapping */
5308 outpos = p-PyUnicode_AS_UNICODE(v);
5309 startinpos = s-starts;
5310 endinpos = startinpos+1;
5311 if (unicode_decode_call_errorhandler(
5312 errors, &errorHandler,
5313 "charmap", "character maps to <undefined>",
5314 &starts, &e, &startinpos, &endinpos, &exc, &s,
5315 &v, &outpos, &p)) {
5316 Py_DECREF(x);
5317 goto onError;
5318 }
5319 Py_DECREF(x);
5320 continue;
5321 }
5322 else if (PyUnicode_Check(x)) {
5323 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005324
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 if (targetsize == 1)
5326 /* 1-1 mapping */
5327 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005328
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 else if (targetsize > 1) {
5330 /* 1-n mapping */
5331 if (targetsize > extrachars) {
5332 /* resize first */
5333 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5334 Py_ssize_t needed = (targetsize - extrachars) + \
5335 (targetsize << 2);
5336 extrachars += needed;
5337 /* XXX overflow detection missing */
5338 if (_PyUnicode_Resize(&v,
5339 PyUnicode_GET_SIZE(v) + needed) < 0) {
5340 Py_DECREF(x);
5341 goto onError;
5342 }
5343 p = PyUnicode_AS_UNICODE(v) + oldpos;
5344 }
5345 Py_UNICODE_COPY(p,
5346 PyUnicode_AS_UNICODE(x),
5347 targetsize);
5348 p += targetsize;
5349 extrachars -= targetsize;
5350 }
5351 /* 1-0 mapping: skip the character */
5352 }
5353 else {
5354 /* wrong return value */
5355 PyErr_SetString(PyExc_TypeError,
5356 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00005357 Py_DECREF(x);
5358 goto onError;
5359 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 Py_DECREF(x);
5361 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 }
5364 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5366 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367 Py_XDECREF(errorHandler);
5368 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00005370
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005372 Py_XDECREF(errorHandler);
5373 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 Py_XDECREF(v);
5375 return NULL;
5376}
5377
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005378/* Charmap encoding: the lookup table */
5379
Alexander Belopolsky40018472011-02-26 01:02:56 +00005380struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 PyObject_HEAD
5382 unsigned char level1[32];
5383 int count2, count3;
5384 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005385};
5386
5387static PyObject*
5388encoding_map_size(PyObject *obj, PyObject* args)
5389{
5390 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005391 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005393}
5394
5395static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005396 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 PyDoc_STR("Return the size (in bytes) of this object") },
5398 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005399};
5400
5401static void
5402encoding_map_dealloc(PyObject* o)
5403{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005404 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005405}
5406
5407static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005408 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 "EncodingMap", /*tp_name*/
5410 sizeof(struct encoding_map), /*tp_basicsize*/
5411 0, /*tp_itemsize*/
5412 /* methods */
5413 encoding_map_dealloc, /*tp_dealloc*/
5414 0, /*tp_print*/
5415 0, /*tp_getattr*/
5416 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00005417 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 0, /*tp_repr*/
5419 0, /*tp_as_number*/
5420 0, /*tp_as_sequence*/
5421 0, /*tp_as_mapping*/
5422 0, /*tp_hash*/
5423 0, /*tp_call*/
5424 0, /*tp_str*/
5425 0, /*tp_getattro*/
5426 0, /*tp_setattro*/
5427 0, /*tp_as_buffer*/
5428 Py_TPFLAGS_DEFAULT, /*tp_flags*/
5429 0, /*tp_doc*/
5430 0, /*tp_traverse*/
5431 0, /*tp_clear*/
5432 0, /*tp_richcompare*/
5433 0, /*tp_weaklistoffset*/
5434 0, /*tp_iter*/
5435 0, /*tp_iternext*/
5436 encoding_map_methods, /*tp_methods*/
5437 0, /*tp_members*/
5438 0, /*tp_getset*/
5439 0, /*tp_base*/
5440 0, /*tp_dict*/
5441 0, /*tp_descr_get*/
5442 0, /*tp_descr_set*/
5443 0, /*tp_dictoffset*/
5444 0, /*tp_init*/
5445 0, /*tp_alloc*/
5446 0, /*tp_new*/
5447 0, /*tp_free*/
5448 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005449};
5450
5451PyObject*
5452PyUnicode_BuildEncodingMap(PyObject* string)
5453{
5454 Py_UNICODE *decode;
5455 PyObject *result;
5456 struct encoding_map *mresult;
5457 int i;
5458 int need_dict = 0;
5459 unsigned char level1[32];
5460 unsigned char level2[512];
5461 unsigned char *mlevel1, *mlevel2, *mlevel3;
5462 int count2 = 0, count3 = 0;
5463
5464 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5465 PyErr_BadArgument();
5466 return NULL;
5467 }
5468 decode = PyUnicode_AS_UNICODE(string);
5469 memset(level1, 0xFF, sizeof level1);
5470 memset(level2, 0xFF, sizeof level2);
5471
5472 /* If there isn't a one-to-one mapping of NULL to \0,
5473 or if there are non-BMP characters, we need to use
5474 a mapping dictionary. */
5475 if (decode[0] != 0)
5476 need_dict = 1;
5477 for (i = 1; i < 256; i++) {
5478 int l1, l2;
5479 if (decode[i] == 0
Benjamin Peterson29060642009-01-31 22:14:21 +00005480#ifdef Py_UNICODE_WIDE
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005481 || decode[i] > 0xFFFF
Benjamin Peterson29060642009-01-31 22:14:21 +00005482#endif
5483 ) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005484 need_dict = 1;
5485 break;
5486 }
5487 if (decode[i] == 0xFFFE)
5488 /* unmapped character */
5489 continue;
5490 l1 = decode[i] >> 11;
5491 l2 = decode[i] >> 7;
5492 if (level1[l1] == 0xFF)
5493 level1[l1] = count2++;
5494 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00005495 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005496 }
5497
5498 if (count2 >= 0xFF || count3 >= 0xFF)
5499 need_dict = 1;
5500
5501 if (need_dict) {
5502 PyObject *result = PyDict_New();
5503 PyObject *key, *value;
5504 if (!result)
5505 return NULL;
5506 for (i = 0; i < 256; i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00005507 key = PyLong_FromLong(decode[i]);
5508 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005509 if (!key || !value)
5510 goto failed1;
5511 if (PyDict_SetItem(result, key, value) == -1)
5512 goto failed1;
5513 Py_DECREF(key);
5514 Py_DECREF(value);
5515 }
5516 return result;
5517 failed1:
5518 Py_XDECREF(key);
5519 Py_XDECREF(value);
5520 Py_DECREF(result);
5521 return NULL;
5522 }
5523
5524 /* Create a three-level trie */
5525 result = PyObject_MALLOC(sizeof(struct encoding_map) +
5526 16*count2 + 128*count3 - 1);
5527 if (!result)
5528 return PyErr_NoMemory();
5529 PyObject_Init(result, &EncodingMapType);
5530 mresult = (struct encoding_map*)result;
5531 mresult->count2 = count2;
5532 mresult->count3 = count3;
5533 mlevel1 = mresult->level1;
5534 mlevel2 = mresult->level23;
5535 mlevel3 = mresult->level23 + 16*count2;
5536 memcpy(mlevel1, level1, 32);
5537 memset(mlevel2, 0xFF, 16*count2);
5538 memset(mlevel3, 0, 128*count3);
5539 count3 = 0;
5540 for (i = 1; i < 256; i++) {
5541 int o1, o2, o3, i2, i3;
5542 if (decode[i] == 0xFFFE)
5543 /* unmapped character */
5544 continue;
5545 o1 = decode[i]>>11;
5546 o2 = (decode[i]>>7) & 0xF;
5547 i2 = 16*mlevel1[o1] + o2;
5548 if (mlevel2[i2] == 0xFF)
5549 mlevel2[i2] = count3++;
5550 o3 = decode[i] & 0x7F;
5551 i3 = 128*mlevel2[i2] + o3;
5552 mlevel3[i3] = i;
5553 }
5554 return result;
5555}
5556
5557static int
5558encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5559{
5560 struct encoding_map *map = (struct encoding_map*)mapping;
5561 int l1 = c>>11;
5562 int l2 = (c>>7) & 0xF;
5563 int l3 = c & 0x7F;
5564 int i;
5565
5566#ifdef Py_UNICODE_WIDE
5567 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005569 }
5570#endif
5571 if (c == 0)
5572 return 0;
5573 /* level 1*/
5574 i = map->level1[l1];
5575 if (i == 0xFF) {
5576 return -1;
5577 }
5578 /* level 2*/
5579 i = map->level23[16*i+l2];
5580 if (i == 0xFF) {
5581 return -1;
5582 }
5583 /* level 3 */
5584 i = map->level23[16*map->count2 + 128*i + l3];
5585 if (i == 0) {
5586 return -1;
5587 }
5588 return i;
5589}
5590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591/* Lookup the character ch in the mapping. If the character
5592 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00005593 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005594static PyObject *
5595charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596{
Christian Heimes217cfd12007-12-02 14:31:20 +00005597 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 PyObject *x;
5599
5600 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602 x = PyObject_GetItem(mapping, w);
5603 Py_DECREF(w);
5604 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5606 /* No mapping found means: mapping is undefined. */
5607 PyErr_Clear();
5608 x = Py_None;
5609 Py_INCREF(x);
5610 return x;
5611 } else
5612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00005614 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00005615 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00005616 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 long value = PyLong_AS_LONG(x);
5618 if (value < 0 || value > 255) {
5619 PyErr_SetString(PyExc_TypeError,
5620 "character mapping must be in range(256)");
5621 Py_DECREF(x);
5622 return NULL;
5623 }
5624 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005626 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 /* wrong return value */
5630 PyErr_Format(PyExc_TypeError,
5631 "character mapping must return integer, bytes or None, not %.400s",
5632 x->ob_type->tp_name);
5633 Py_DECREF(x);
5634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 }
5636}
5637
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005638static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00005639charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005640{
Benjamin Peterson14339b62009-01-31 16:36:08 +00005641 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5642 /* exponentially overallocate to minimize reallocations */
5643 if (requiredsize < 2*outsize)
5644 requiredsize = 2*outsize;
5645 if (_PyBytes_Resize(outobj, requiredsize))
5646 return -1;
5647 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005648}
5649
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00005654 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005655 space is available. Return a new reference to the object that
5656 was put in the output buffer, or Py_None, if the mapping was undefined
5657 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00005658 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005659static charmapencode_result
5660charmapencode_output(Py_UNICODE c, PyObject *mapping,
5661 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005662{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005663 PyObject *rep;
5664 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00005665 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005666
Christian Heimes90aa7642007-12-19 02:45:37 +00005667 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005668 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005670 if (res == -1)
5671 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 if (outsize<requiredsize)
5673 if (charmapencode_resize(outobj, outpos, requiredsize))
5674 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00005675 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 outstart[(*outpos)++] = (char)res;
5677 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005678 }
5679
5680 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005681 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005683 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 Py_DECREF(rep);
5685 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005686 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 if (PyLong_Check(rep)) {
5688 Py_ssize_t requiredsize = *outpos+1;
5689 if (outsize<requiredsize)
5690 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5691 Py_DECREF(rep);
5692 return enc_EXCEPTION;
5693 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005694 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005695 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005696 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005697 else {
5698 const char *repchars = PyBytes_AS_STRING(rep);
5699 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5700 Py_ssize_t requiredsize = *outpos+repsize;
5701 if (outsize<requiredsize)
5702 if (charmapencode_resize(outobj, outpos, requiredsize)) {
5703 Py_DECREF(rep);
5704 return enc_EXCEPTION;
5705 }
Christian Heimes72b710a2008-05-26 13:28:38 +00005706 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 memcpy(outstart + *outpos, repchars, repsize);
5708 *outpos += repsize;
5709 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005710 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005711 Py_DECREF(rep);
5712 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005713}
5714
5715/* handle an error in PyUnicode_EncodeCharmap
5716 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005717static int
5718charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00005719 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00005721 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00005722 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005723{
5724 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005725 Py_ssize_t repsize;
5726 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727 Py_UNICODE *uni2;
5728 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005729 Py_ssize_t collstartpos = *inpos;
5730 Py_ssize_t collendpos = *inpos+1;
5731 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 char *encoding = "charmap";
5733 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005734 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005736 /* find all unencodable characters */
5737 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00005738 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00005739 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 int res = encoding_map_lookup(p[collendpos], mapping);
5741 if (res != -1)
5742 break;
5743 ++collendpos;
5744 continue;
5745 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005746
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 rep = charmapencode_lookup(p[collendpos], mapping);
5748 if (rep==NULL)
5749 return -1;
5750 else if (rep!=Py_None) {
5751 Py_DECREF(rep);
5752 break;
5753 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005754 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 }
5757 /* cache callback name lookup
5758 * (if not done yet, i.e. it's the first error) */
5759 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 if ((errors==NULL) || (!strcmp(errors, "strict")))
5761 *known_errorHandler = 1;
5762 else if (!strcmp(errors, "replace"))
5763 *known_errorHandler = 2;
5764 else if (!strcmp(errors, "ignore"))
5765 *known_errorHandler = 3;
5766 else if (!strcmp(errors, "xmlcharrefreplace"))
5767 *known_errorHandler = 4;
5768 else
5769 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 }
5771 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005772 case 1: /* strict */
5773 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5774 return -1;
5775 case 2: /* replace */
5776 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 x = charmapencode_output('?', mapping, res, respos);
5778 if (x==enc_EXCEPTION) {
5779 return -1;
5780 }
5781 else if (x==enc_FAILED) {
5782 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5783 return -1;
5784 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005785 }
5786 /* fall through */
5787 case 3: /* ignore */
5788 *inpos = collendpos;
5789 break;
5790 case 4: /* xmlcharrefreplace */
5791 /* generate replacement (temporarily (mis)uses p) */
5792 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 char buffer[2+29+1+1];
5794 char *cp;
5795 sprintf(buffer, "&#%d;", (int)p[collpos]);
5796 for (cp = buffer; *cp; ++cp) {
5797 x = charmapencode_output(*cp, mapping, res, respos);
5798 if (x==enc_EXCEPTION)
5799 return -1;
5800 else if (x==enc_FAILED) {
5801 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5802 return -1;
5803 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005804 }
5805 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005806 *inpos = collendpos;
5807 break;
5808 default:
5809 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 encoding, reason, p, size, exceptionObject,
5811 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00005812 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00005814 if (PyBytes_Check(repunicode)) {
5815 /* Directly copy bytes result to output. */
5816 Py_ssize_t outsize = PyBytes_Size(*res);
5817 Py_ssize_t requiredsize;
5818 repsize = PyBytes_Size(repunicode);
5819 requiredsize = *respos + repsize;
5820 if (requiredsize > outsize)
5821 /* Make room for all additional bytes. */
5822 if (charmapencode_resize(res, respos, requiredsize)) {
5823 Py_DECREF(repunicode);
5824 return -1;
5825 }
5826 memcpy(PyBytes_AsString(*res) + *respos,
5827 PyBytes_AsString(repunicode), repsize);
5828 *respos += repsize;
5829 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005830 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00005831 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00005832 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005833 /* generate replacement */
5834 repsize = PyUnicode_GET_SIZE(repunicode);
5835 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 x = charmapencode_output(*uni2, mapping, res, respos);
5837 if (x==enc_EXCEPTION) {
5838 return -1;
5839 }
5840 else if (x==enc_FAILED) {
5841 Py_DECREF(repunicode);
5842 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5843 return -1;
5844 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005845 }
5846 *inpos = newpos;
5847 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005848 }
5849 return 0;
5850}
5851
Alexander Belopolsky40018472011-02-26 01:02:56 +00005852PyObject *
5853PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5854 Py_ssize_t size,
5855 PyObject *mapping,
5856 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005858 /* output object */
5859 PyObject *res = NULL;
5860 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005861 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005862 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00005863 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 PyObject *errorHandler = NULL;
5865 PyObject *exc = NULL;
5866 /* the following variable is used for caching string comparisons
5867 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5868 * 3=ignore, 4=xmlcharrefreplace */
5869 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
5871 /* Default to Latin-1 */
5872 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005875 /* allocate enough for a simple encoding without
5876 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00005877 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 if (res == NULL)
5879 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00005880 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005881 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 /* try to encode it */
5885 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5886 if (x==enc_EXCEPTION) /* error */
5887 goto onError;
5888 if (x==enc_FAILED) { /* unencodable character */
5889 if (charmap_encoding_error(p, size, &inpos, mapping,
5890 &exc,
5891 &known_errorHandler, &errorHandler, errors,
5892 &res, &respos)) {
5893 goto onError;
5894 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005895 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 else
5897 /* done with this character => adjust input position */
5898 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005901 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00005902 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005903 if (_PyBytes_Resize(&res, respos) < 0)
5904 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005905
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 Py_XDECREF(exc);
5907 Py_XDECREF(errorHandler);
5908 return res;
5909
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 Py_XDECREF(res);
5912 Py_XDECREF(exc);
5913 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 return NULL;
5915}
5916
Alexander Belopolsky40018472011-02-26 01:02:56 +00005917PyObject *
5918PyUnicode_AsCharmapString(PyObject *unicode,
5919 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920{
5921 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 PyErr_BadArgument();
5923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 }
5925 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 PyUnicode_GET_SIZE(unicode),
5927 mapping,
5928 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929}
5930
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005932static void
5933make_translate_exception(PyObject **exceptionObject,
5934 const Py_UNICODE *unicode, Py_ssize_t size,
5935 Py_ssize_t startpos, Py_ssize_t endpos,
5936 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 if (*exceptionObject == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00005939 *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 }
5942 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5944 goto onError;
5945 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5946 goto onError;
5947 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5948 goto onError;
5949 return;
5950 onError:
5951 Py_DECREF(*exceptionObject);
5952 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 }
5954}
5955
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005956/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005957static void
5958raise_translate_exception(PyObject **exceptionObject,
5959 const Py_UNICODE *unicode, Py_ssize_t size,
5960 Py_ssize_t startpos, Py_ssize_t endpos,
5961 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005962{
5963 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005965 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005966 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967}
5968
5969/* error handling callback helper:
5970 build arguments, call the callback and check the arguments,
5971 put the result into newpos and return the replacement string, which
5972 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00005973static PyObject *
5974unicode_translate_call_errorhandler(const char *errors,
5975 PyObject **errorHandler,
5976 const char *reason,
5977 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5978 Py_ssize_t startpos, Py_ssize_t endpos,
5979 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980{
Benjamin Peterson142957c2008-07-04 19:55:29 +00005981 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005982
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00005983 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005984 PyObject *restuple;
5985 PyObject *resunicode;
5986
5987 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005989 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005991 }
5992
5993 make_translate_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005995 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005996 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005997
5998 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006000 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00006003 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006004 Py_DECREF(restuple);
6005 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006006 }
6007 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 &resunicode, &i_newpos)) {
6009 Py_DECREF(restuple);
6010 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006011 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00006012 if (i_newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 *newpos = size+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006014 else
6015 *newpos = i_newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006016 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6018 Py_DECREF(restuple);
6019 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006020 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006021 Py_INCREF(resunicode);
6022 Py_DECREF(restuple);
6023 return resunicode;
6024}
6025
6026/* Lookup the character ch in the mapping and put the result in result,
6027 which must be decrefed by the caller.
6028 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006029static int
6030charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006031{
Christian Heimes217cfd12007-12-02 14:31:20 +00006032 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006033 PyObject *x;
6034
6035 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006037 x = PyObject_GetItem(mapping, w);
6038 Py_DECREF(w);
6039 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6041 /* No mapping found means: use 1:1 mapping. */
6042 PyErr_Clear();
6043 *result = NULL;
6044 return 0;
6045 } else
6046 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006047 }
6048 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 *result = x;
6050 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051 }
Christian Heimes217cfd12007-12-02 14:31:20 +00006052 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 long value = PyLong_AS_LONG(x);
6054 long max = PyUnicode_GetMax();
6055 if (value < 0 || value > max) {
6056 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00006057 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 Py_DECREF(x);
6059 return -1;
6060 }
6061 *result = x;
6062 return 0;
6063 }
6064 else if (PyUnicode_Check(x)) {
6065 *result = x;
6066 return 0;
6067 }
6068 else {
6069 /* wrong return value */
6070 PyErr_SetString(PyExc_TypeError,
6071 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00006072 Py_DECREF(x);
6073 return -1;
6074 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075}
6076/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 if not reallocate and adjust various state variables.
6078 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006079static int
6080charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006082{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006083 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
Walter Dörwald4894c302003-10-24 14:25:28 +00006084 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 /* remember old output position */
6086 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6087 /* exponentially overallocate to minimize reallocations */
6088 if (requiredsize < 2 * oldsize)
6089 requiredsize = 2 * oldsize;
6090 if (PyUnicode_Resize(outobj, requiredsize) < 0)
6091 return -1;
6092 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 }
6094 return 0;
6095}
6096/* lookup the character, put the result in the output string and adjust
6097 various state variables. Return a new reference to the object that
6098 was put in the output buffer in *result, or Py_None, if the mapping was
6099 undefined (in which case no character was written).
6100 The called must decref result.
6101 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006102static int
6103charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6104 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6105 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106{
Walter Dörwald4894c302003-10-24 14:25:28 +00006107 if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006109 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 /* not found => default to 1:1 mapping */
6111 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 }
6113 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00006114 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00006115 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006116 /* no overflow check, because we know that the space is enough */
6117 *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118 }
6119 else if (PyUnicode_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6121 if (repsize==1) {
6122 /* no overflow check, because we know that the space is enough */
6123 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6124 }
6125 else if (repsize!=0) {
6126 /* more than one character */
6127 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6128 (insize - (curinp-startinp)) +
6129 repsize - 1;
6130 if (charmaptranslate_makespace(outobj, outp, requiredsize))
6131 return -1;
6132 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6133 *outp += repsize;
6134 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006135 }
6136 else
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 return 0;
6139}
6140
Alexander Belopolsky40018472011-02-26 01:02:56 +00006141PyObject *
6142PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6143 Py_ssize_t size,
6144 PyObject *mapping,
6145 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 /* output object */
6148 PyObject *res = NULL;
6149 /* pointers to the beginning and end+1 of input */
6150 const Py_UNICODE *startp = p;
6151 const Py_UNICODE *endp = p + size;
6152 /* pointer into the output */
6153 Py_UNICODE *str;
6154 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006155 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006156 char *reason = "character maps to <undefined>";
6157 PyObject *errorHandler = NULL;
6158 PyObject *exc = NULL;
6159 /* the following variable is used for caching string comparisons
6160 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6161 * 3=ignore, 4=xmlcharrefreplace */
6162 int known_errorHandler = -1;
6163
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 PyErr_BadArgument();
6166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006168
6169 /* allocate enough for a simple 1:1 translation without
6170 replacements, if we need more, we'll resize */
6171 res = PyUnicode_FromUnicode(NULL, size);
6172 if (res == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006173 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 return res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006176 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006178 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 /* try to encode it */
6180 PyObject *x = NULL;
6181 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6182 Py_XDECREF(x);
6183 goto onError;
6184 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006185 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 if (x!=Py_None) /* it worked => adjust input pointer */
6187 ++p;
6188 else { /* untranslatable character */
6189 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6190 Py_ssize_t repsize;
6191 Py_ssize_t newpos;
6192 Py_UNICODE *uni2;
6193 /* startpos for collecting untranslatable chars */
6194 const Py_UNICODE *collstart = p;
6195 const Py_UNICODE *collend = p+1;
6196 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 /* find all untranslatable characters */
6199 while (collend < endp) {
6200 if (charmaptranslate_lookup(*collend, mapping, &x))
6201 goto onError;
6202 Py_XDECREF(x);
6203 if (x!=Py_None)
6204 break;
6205 ++collend;
6206 }
6207 /* cache callback name lookup
6208 * (if not done yet, i.e. it's the first error) */
6209 if (known_errorHandler==-1) {
6210 if ((errors==NULL) || (!strcmp(errors, "strict")))
6211 known_errorHandler = 1;
6212 else if (!strcmp(errors, "replace"))
6213 known_errorHandler = 2;
6214 else if (!strcmp(errors, "ignore"))
6215 known_errorHandler = 3;
6216 else if (!strcmp(errors, "xmlcharrefreplace"))
6217 known_errorHandler = 4;
6218 else
6219 known_errorHandler = 0;
6220 }
6221 switch (known_errorHandler) {
6222 case 1: /* strict */
6223 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006224 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006225 case 2: /* replace */
6226 /* No need to check for space, this is a 1:1 replacement */
6227 for (coll = collstart; coll<collend; ++coll)
6228 *str++ = '?';
6229 /* fall through */
6230 case 3: /* ignore */
6231 p = collend;
6232 break;
6233 case 4: /* xmlcharrefreplace */
6234 /* generate replacement (temporarily (mis)uses p) */
6235 for (p = collstart; p < collend; ++p) {
6236 char buffer[2+29+1+1];
6237 char *cp;
6238 sprintf(buffer, "&#%d;", (int)*p);
6239 if (charmaptranslate_makespace(&res, &str,
6240 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6241 goto onError;
6242 for (cp = buffer; *cp; ++cp)
6243 *str++ = *cp;
6244 }
6245 p = collend;
6246 break;
6247 default:
6248 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6249 reason, startp, size, &exc,
6250 collstart-startp, collend-startp, &newpos);
6251 if (repunicode == NULL)
6252 goto onError;
6253 /* generate replacement */
6254 repsize = PyUnicode_GET_SIZE(repunicode);
6255 if (charmaptranslate_makespace(&res, &str,
6256 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6257 Py_DECREF(repunicode);
6258 goto onError;
6259 }
6260 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6261 *str++ = *uni2;
6262 p = startp + newpos;
6263 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006264 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006265 }
6266 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006267 /* Resize if we allocated to much */
6268 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00006269 if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 if (PyUnicode_Resize(&res, respos) < 0)
6271 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 }
6273 Py_XDECREF(exc);
6274 Py_XDECREF(errorHandler);
6275 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006278 Py_XDECREF(res);
6279 Py_XDECREF(exc);
6280 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 return NULL;
6282}
6283
Alexander Belopolsky40018472011-02-26 01:02:56 +00006284PyObject *
6285PyUnicode_Translate(PyObject *str,
6286 PyObject *mapping,
6287 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288{
6289 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006290
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 str = PyUnicode_FromObject(str);
6292 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006293 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 PyUnicode_GET_SIZE(str),
6296 mapping,
6297 errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 Py_DECREF(str);
6299 return result;
Tim Petersced69f82003-09-16 20:30:58 +00006300
Benjamin Peterson29060642009-01-31 22:14:21 +00006301 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 Py_XDECREF(str);
6303 return NULL;
6304}
Tim Petersced69f82003-09-16 20:30:58 +00006305
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00006306PyObject *
6307PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6308 Py_ssize_t length)
6309{
6310 PyObject *result;
6311 Py_UNICODE *p; /* write pointer into result */
6312 Py_ssize_t i;
6313 /* Copy to a new string */
6314 result = (PyObject *)_PyUnicode_New(length);
6315 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6316 if (result == NULL)
6317 return result;
6318 p = PyUnicode_AS_UNICODE(result);
6319 /* Iterate over code points */
6320 for (i = 0; i < length; i++) {
6321 Py_UNICODE ch =s[i];
6322 if (ch > 127) {
6323 int decimal = Py_UNICODE_TODECIMAL(ch);
6324 if (decimal >= 0)
6325 p[i] = '0' + decimal;
6326 }
6327 }
6328 return result;
6329}
Guido van Rossum9e896b32000-04-05 20:11:21 +00006330/* --- Decimal Encoder ---------------------------------------------------- */
6331
Alexander Belopolsky40018472011-02-26 01:02:56 +00006332int
6333PyUnicode_EncodeDecimal(Py_UNICODE *s,
6334 Py_ssize_t length,
6335 char *output,
6336 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00006337{
6338 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006339 PyObject *errorHandler = NULL;
6340 PyObject *exc = NULL;
6341 const char *encoding = "decimal";
6342 const char *reason = "invalid decimal Unicode string";
6343 /* the following variable is used for caching string comparisons
6344 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6345 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006346
6347 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 PyErr_BadArgument();
6349 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00006350 }
6351
6352 p = s;
6353 end = s + length;
6354 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 register Py_UNICODE ch = *p;
6356 int decimal;
6357 PyObject *repunicode;
6358 Py_ssize_t repsize;
6359 Py_ssize_t newpos;
6360 Py_UNICODE *uni2;
6361 Py_UNICODE *collstart;
6362 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00006363
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006365 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 ++p;
6367 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006368 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 decimal = Py_UNICODE_TODECIMAL(ch);
6370 if (decimal >= 0) {
6371 *output++ = '0' + decimal;
6372 ++p;
6373 continue;
6374 }
6375 if (0 < ch && ch < 256) {
6376 *output++ = (char)ch;
6377 ++p;
6378 continue;
6379 }
6380 /* All other characters are considered unencodable */
6381 collstart = p;
6382 collend = p+1;
6383 while (collend < end) {
6384 if ((0 < *collend && *collend < 256) ||
6385 !Py_UNICODE_ISSPACE(*collend) ||
6386 Py_UNICODE_TODECIMAL(*collend))
6387 break;
6388 }
6389 /* cache callback name lookup
6390 * (if not done yet, i.e. it's the first error) */
6391 if (known_errorHandler==-1) {
6392 if ((errors==NULL) || (!strcmp(errors, "strict")))
6393 known_errorHandler = 1;
6394 else if (!strcmp(errors, "replace"))
6395 known_errorHandler = 2;
6396 else if (!strcmp(errors, "ignore"))
6397 known_errorHandler = 3;
6398 else if (!strcmp(errors, "xmlcharrefreplace"))
6399 known_errorHandler = 4;
6400 else
6401 known_errorHandler = 0;
6402 }
6403 switch (known_errorHandler) {
6404 case 1: /* strict */
6405 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6406 goto onError;
6407 case 2: /* replace */
6408 for (p = collstart; p < collend; ++p)
6409 *output++ = '?';
6410 /* fall through */
6411 case 3: /* ignore */
6412 p = collend;
6413 break;
6414 case 4: /* xmlcharrefreplace */
6415 /* generate replacement (temporarily (mis)uses p) */
6416 for (p = collstart; p < collend; ++p)
6417 output += sprintf(output, "&#%d;", (int)*p);
6418 p = collend;
6419 break;
6420 default:
6421 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6422 encoding, reason, s, length, &exc,
6423 collstart-s, collend-s, &newpos);
6424 if (repunicode == NULL)
6425 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006426 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006427 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006428 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6429 Py_DECREF(repunicode);
6430 goto onError;
6431 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 /* generate replacement */
6433 repsize = PyUnicode_GET_SIZE(repunicode);
6434 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6435 Py_UNICODE ch = *uni2;
6436 if (Py_UNICODE_ISSPACE(ch))
6437 *output++ = ' ';
6438 else {
6439 decimal = Py_UNICODE_TODECIMAL(ch);
6440 if (decimal >= 0)
6441 *output++ = '0' + decimal;
6442 else if (0 < ch && ch < 256)
6443 *output++ = (char)ch;
6444 else {
6445 Py_DECREF(repunicode);
6446 raise_encode_exception(&exc, encoding,
6447 s, length, collstart-s, collend-s, reason);
6448 goto onError;
6449 }
6450 }
6451 }
6452 p = s + newpos;
6453 Py_DECREF(repunicode);
6454 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00006455 }
6456 /* 0-terminate the output string */
6457 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458 Py_XDECREF(exc);
6459 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006460 return 0;
6461
Benjamin Peterson29060642009-01-31 22:14:21 +00006462 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 Py_XDECREF(exc);
6464 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00006465 return -1;
6466}
6467
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468/* --- Helpers ------------------------------------------------------------ */
6469
Eric Smith8c663262007-08-25 02:26:07 +00006470#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006471#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006472
Thomas Wouters477c8d52006-05-27 19:21:47 +00006473#include "stringlib/count.h"
6474#include "stringlib/find.h"
6475#include "stringlib/partition.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006476#include "stringlib/split.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00006477
Eric Smith5807c412008-05-11 21:00:57 +00006478#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
Eric Smitha3b1ac82009-04-03 14:45:06 +00006479#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
Eric Smith5807c412008-05-11 21:00:57 +00006480#include "stringlib/localeutil.h"
6481
/* helper macro to fixup start/end slice values:
   end is clamped to len; a negative start or end has len added to it and
   is then clamped to 0.
   start and end must be modifiable lvalues, and every argument may be
   evaluated more than once.  The do { } while (0) wrapper makes the
   expansion a single statement, so it is safe under an unbraced if/else;
   a trailing semicolon is required at the call site. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00006496
Alexander Belopolsky40018472011-02-26 01:02:56 +00006497Py_ssize_t
6498PyUnicode_Count(PyObject *str,
6499 PyObject *substr,
6500 Py_ssize_t start,
6501 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006503 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006504 PyUnicodeObject* str_obj;
6505 PyUnicodeObject* sub_obj;
Tim Petersced69f82003-09-16 20:30:58 +00006506
Thomas Wouters477c8d52006-05-27 19:21:47 +00006507 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6508 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006510 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6511 if (!sub_obj) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 Py_DECREF(str_obj);
6513 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 }
Tim Petersced69f82003-09-16 20:30:58 +00006515
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006516 ADJUST_INDICES(start, end, str_obj->length);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006517 result = stringlib_count(
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006518 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6519 PY_SSIZE_T_MAX
Thomas Wouters477c8d52006-05-27 19:21:47 +00006520 );
6521
6522 Py_DECREF(sub_obj);
6523 Py_DECREF(str_obj);
6524
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 return result;
6526}
6527
Alexander Belopolsky40018472011-02-26 01:02:56 +00006528Py_ssize_t
6529PyUnicode_Find(PyObject *str,
6530 PyObject *sub,
6531 Py_ssize_t start,
6532 Py_ssize_t end,
6533 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006535 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006536
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 str = PyUnicode_FromObject(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006538 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006540 sub = PyUnicode_FromObject(sub);
6541 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 Py_DECREF(str);
6543 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 }
Tim Petersced69f82003-09-16 20:30:58 +00006545
Thomas Wouters477c8d52006-05-27 19:21:47 +00006546 if (direction > 0)
6547 result = stringlib_find_slice(
6548 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6549 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6550 start, end
6551 );
6552 else
6553 result = stringlib_rfind_slice(
6554 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6555 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6556 start, end
6557 );
6558
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00006560 Py_DECREF(sub);
6561
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562 return result;
6563}
6564
Alexander Belopolsky40018472011-02-26 01:02:56 +00006565static int
6566tailmatch(PyUnicodeObject *self,
6567 PyUnicodeObject *substring,
6568 Py_ssize_t start,
6569 Py_ssize_t end,
6570 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 if (substring->length == 0)
6573 return 1;
6574
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006575 ADJUST_INDICES(start, end, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 end -= substring->length;
6577 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579
6580 if (direction > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 if (Py_UNICODE_MATCH(self, end, substring))
6582 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 } else {
6584 if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006586 }
6587
6588 return 0;
6589}
6590
Alexander Belopolsky40018472011-02-26 01:02:56 +00006591Py_ssize_t
6592PyUnicode_Tailmatch(PyObject *str,
6593 PyObject *substr,
6594 Py_ssize_t start,
6595 Py_ssize_t end,
6596 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006598 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00006599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 str = PyUnicode_FromObject(str);
6601 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 substr = PyUnicode_FromObject(substr);
6604 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006605 Py_DECREF(str);
6606 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 }
Tim Petersced69f82003-09-16 20:30:58 +00006608
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 (PyUnicodeObject *)substr,
6611 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 Py_DECREF(str);
6613 Py_DECREF(substr);
6614 return result;
6615}
6616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617/* Apply fixfct filter to the Unicode object self and return a
6618 reference to the modified object */
6619
Alexander Belopolsky40018472011-02-26 01:02:56 +00006620static PyObject *
6621fixup(PyUnicodeObject *self,
6622 int (*fixfct)(PyUnicodeObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623{
6624
6625 PyUnicodeObject *u;
6626
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006627 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006630
6631 Py_UNICODE_COPY(u->str, self->str, self->length);
6632
Tim Peters7a29bd52001-09-12 03:03:31 +00006633 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006634 /* fixfct should return TRUE if it modified the buffer. If
6635 FALSE, return a reference to the original buffer instead
6636 (to save space, not time) */
6637 Py_INCREF(self);
6638 Py_DECREF(u);
6639 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 }
6641 return (PyObject*) u;
6642}
6643
Alexander Belopolsky40018472011-02-26 01:02:56 +00006644static int
6645fixupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006647 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648 Py_UNICODE *s = self->str;
6649 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006650
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006652 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006653
Benjamin Peterson29060642009-01-31 22:14:21 +00006654 ch = Py_UNICODE_TOUPPER(*s);
6655 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006657 *s = ch;
6658 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 s++;
6660 }
6661
6662 return status;
6663}
6664
Alexander Belopolsky40018472011-02-26 01:02:56 +00006665static int
6666fixlower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006668 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006669 Py_UNICODE *s = self->str;
6670 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006671
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672 while (len-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00006674
Benjamin Peterson29060642009-01-31 22:14:21 +00006675 ch = Py_UNICODE_TOLOWER(*s);
6676 if (ch != *s) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 status = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006678 *s = ch;
6679 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680 s++;
6681 }
6682
6683 return status;
6684}
6685
Alexander Belopolsky40018472011-02-26 01:02:56 +00006686static int
6687fixswapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006689 Py_ssize_t len = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 Py_UNICODE *s = self->str;
6691 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006692
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 while (len-- > 0) {
6694 if (Py_UNICODE_ISUPPER(*s)) {
6695 *s = Py_UNICODE_TOLOWER(*s);
6696 status = 1;
6697 } else if (Py_UNICODE_ISLOWER(*s)) {
6698 *s = Py_UNICODE_TOUPPER(*s);
6699 status = 1;
6700 }
6701 s++;
6702 }
6703
6704 return status;
6705}
6706
Alexander Belopolsky40018472011-02-26 01:02:56 +00006707static int
6708fixcapitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006709{
Martin v. Löwis18e16552006-02-15 17:27:45 +00006710 Py_ssize_t len = self->length;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006711 Py_UNICODE *s = self->str;
6712 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00006713
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006714 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 return 0;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006716 if (Py_UNICODE_ISLOWER(*s)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 *s = Py_UNICODE_TOUPPER(*s);
6718 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00006720 s++;
6721 while (--len > 0) {
6722 if (Py_UNICODE_ISUPPER(*s)) {
6723 *s = Py_UNICODE_TOLOWER(*s);
6724 status = 1;
6725 }
6726 s++;
6727 }
6728 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729}
6730
Alexander Belopolsky40018472011-02-26 01:02:56 +00006731static int
6732fixtitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733{
6734 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6735 register Py_UNICODE *e;
6736 int previous_is_cased;
6737
6738 /* Shortcut for single character strings */
6739 if (PyUnicode_GET_SIZE(self) == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6741 if (*p != ch) {
6742 *p = ch;
6743 return 1;
6744 }
6745 else
6746 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747 }
Tim Petersced69f82003-09-16 20:30:58 +00006748
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 e = p + PyUnicode_GET_SIZE(self);
6750 previous_is_cased = 0;
6751 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00006753
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 if (previous_is_cased)
6755 *p = Py_UNICODE_TOLOWER(ch);
6756 else
6757 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00006758
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 if (Py_UNICODE_ISLOWER(ch) ||
6760 Py_UNICODE_ISUPPER(ch) ||
6761 Py_UNICODE_ISTITLE(ch))
6762 previous_is_cased = 1;
6763 else
6764 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765 }
6766 return 1;
6767}
6768
Tim Peters8ce9f162004-08-27 01:49:32 +00006769PyObject *
6770PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771{
Skip Montanaro6543b452004-09-16 03:28:13 +00006772 const Py_UNICODE blank = ' ';
6773 const Py_UNICODE *sep = &blank;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006774 Py_ssize_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006775 PyUnicodeObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00006776 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
6777 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006778 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
6779 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00006780 PyObject *item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006781 Py_ssize_t sz, i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782
Tim Peters05eba1f2004-08-27 21:32:02 +00006783 fseq = PySequence_Fast(seq, "");
6784 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006785 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00006786 }
6787
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006788 /* NOTE: the following code can't call back into Python code,
6789 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00006790 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006791
Tim Peters05eba1f2004-08-27 21:32:02 +00006792 seqlen = PySequence_Fast_GET_SIZE(fseq);
6793 /* If empty sequence, return u"". */
6794 if (seqlen == 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00006795 res = _PyUnicode_New(0); /* empty sequence; return u"" */
6796 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00006797 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006798 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00006799 /* If singleton sequence with an exact Unicode, return that. */
6800 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 item = items[0];
6802 if (PyUnicode_CheckExact(item)) {
6803 Py_INCREF(item);
6804 res = (PyUnicodeObject *)item;
6805 goto Done;
6806 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006807 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006808 else {
6809 /* Set up sep and seplen */
6810 if (separator == NULL) {
6811 sep = &blank;
6812 seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00006813 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006814 else {
6815 if (!PyUnicode_Check(separator)) {
6816 PyErr_Format(PyExc_TypeError,
6817 "separator: expected str instance,"
6818 " %.80s found",
6819 Py_TYPE(separator)->tp_name);
6820 goto onError;
6821 }
6822 sep = PyUnicode_AS_UNICODE(separator);
6823 seplen = PyUnicode_GET_SIZE(separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00006824 }
6825 }
6826
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006827 /* There are at least two things to join, or else we have a subclass
6828 * of str in the sequence.
6829 * Do a pre-pass to figure out the total amount of space we'll
6830 * need (sz), and see whether all argument are strings.
6831 */
6832 sz = 0;
6833 for (i = 0; i < seqlen; i++) {
6834 const Py_ssize_t old_sz = sz;
6835 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 if (!PyUnicode_Check(item)) {
6837 PyErr_Format(PyExc_TypeError,
6838 "sequence item %zd: expected str instance,"
6839 " %.80s found",
6840 i, Py_TYPE(item)->tp_name);
6841 goto onError;
6842 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006843 sz += PyUnicode_GET_SIZE(item);
6844 if (i != 0)
6845 sz += seplen;
6846 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6847 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006849 goto onError;
6850 }
6851 }
Tim Petersced69f82003-09-16 20:30:58 +00006852
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006853 res = _PyUnicode_New(sz);
6854 if (res == NULL)
6855 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00006856
Antoine Pitrouaf14b792008-08-07 21:50:41 +00006857 /* Catenate everything. */
6858 res_p = PyUnicode_AS_UNICODE(res);
6859 for (i = 0; i < seqlen; ++i) {
6860 Py_ssize_t itemlen;
6861 item = items[i];
6862 itemlen = PyUnicode_GET_SIZE(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 /* Copy item, and maybe the separator. */
6864 if (i) {
6865 Py_UNICODE_COPY(res_p, sep, seplen);
6866 res_p += seplen;
6867 }
6868 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6869 res_p += itemlen;
Tim Peters05eba1f2004-08-27 21:32:02 +00006870 }
Tim Peters8ce9f162004-08-27 01:49:32 +00006871
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00006873 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874 return (PyObject *)res;
6875
Benjamin Peterson29060642009-01-31 22:14:21 +00006876 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00006877 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00006878 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 return NULL;
6880}
6881
Alexander Belopolsky40018472011-02-26 01:02:56 +00006882static PyUnicodeObject *
6883pad(PyUnicodeObject *self,
6884 Py_ssize_t left,
6885 Py_ssize_t right,
6886 Py_UNICODE fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006887{
6888 PyUnicodeObject *u;
6889
6890 if (left < 0)
6891 left = 0;
6892 if (right < 0)
6893 right = 0;
6894
Tim Peters7a29bd52001-09-12 03:03:31 +00006895 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006896 Py_INCREF(self);
6897 return self;
6898 }
6899
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006900 if (left > PY_SSIZE_T_MAX - self->length ||
6901 right > PY_SSIZE_T_MAX - (left + self->length)) {
6902 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6903 return NULL;
6904 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 u = _PyUnicode_New(left + self->length + right);
6906 if (u) {
6907 if (left)
6908 Py_UNICODE_FILL(u->str, fill, left);
6909 Py_UNICODE_COPY(u->str + left, self->str, self->length);
6910 if (right)
6911 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6912 }
6913
6914 return u;
6915}
6916
Alexander Belopolsky40018472011-02-26 01:02:56 +00006917PyObject *
6918PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006919{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006920 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921
6922 string = PyUnicode_FromObject(string);
6923 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006925
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006926 list = stringlib_splitlines(
6927 (PyObject*) string, PyUnicode_AS_UNICODE(string),
6928 PyUnicode_GET_SIZE(string), keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929
6930 Py_DECREF(string);
6931 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006932}
6933
Alexander Belopolsky40018472011-02-26 01:02:56 +00006934static PyObject *
6935split(PyUnicodeObject *self,
6936 PyUnicodeObject *substring,
6937 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006940 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006943 return stringlib_split_whitespace(
6944 (PyObject*) self, self->str, self->length, maxcount
6945 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006947 return stringlib_split(
6948 (PyObject*) self, self->str, self->length,
6949 substring->str, substring->length,
6950 maxcount
6951 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952}
6953
Alexander Belopolsky40018472011-02-26 01:02:56 +00006954static PyObject *
6955rsplit(PyUnicodeObject *self,
6956 PyUnicodeObject *substring,
6957 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006958{
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006959 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006960 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006961
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006962 if (substring == NULL)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006963 return stringlib_rsplit_whitespace(
6964 (PyObject*) self, self->str, self->length, maxcount
6965 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006966
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006967 return stringlib_rsplit(
6968 (PyObject*) self, self->str, self->length,
6969 substring->str, substring->length,
6970 maxcount
6971 );
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006972}
6973
Alexander Belopolsky40018472011-02-26 01:02:56 +00006974static PyObject *
6975replace(PyUnicodeObject *self,
6976 PyUnicodeObject *str1,
6977 PyUnicodeObject *str2,
6978 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979{
6980 PyUnicodeObject *u;
6981
6982 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 maxcount = PY_SSIZE_T_MAX;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006984 else if (maxcount == 0 || self->length == 0)
6985 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986
Thomas Wouters477c8d52006-05-27 19:21:47 +00006987 if (str1->length == str2->length) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00006988 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006989 /* same length */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00006990 if (str1->length == 0)
6991 goto nothing;
Thomas Wouters477c8d52006-05-27 19:21:47 +00006992 if (str1->length == 1) {
6993 /* replace characters */
6994 Py_UNICODE u1, u2;
6995 if (!findchar(self->str, self->length, str1->str[0]))
6996 goto nothing;
6997 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6998 if (!u)
6999 return NULL;
7000 Py_UNICODE_COPY(u->str, self->str, self->length);
7001 u1 = str1->str[0];
7002 u2 = str2->str[0];
7003 for (i = 0; i < u->length; i++)
7004 if (u->str[i] == u1) {
7005 if (--maxcount < 0)
7006 break;
7007 u->str[i] = u2;
7008 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007009 } else {
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007010 i = stringlib_find(
7011 self->str, self->length, str1->str, str1->length, 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00007012 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00007013 if (i < 0)
7014 goto nothing;
7015 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7016 if (!u)
7017 return NULL;
7018 Py_UNICODE_COPY(u->str, self->str, self->length);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007019
7020 /* change everything in-place, starting with this one */
7021 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7022 i += str1->length;
7023
7024 while ( --maxcount > 0) {
7025 i = stringlib_find(self->str+i, self->length-i,
7026 str1->str, str1->length,
7027 i);
7028 if (i == -1)
7029 break;
7030 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7031 i += str1->length;
7032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007035
Brett Cannonb94767f2011-02-22 20:15:44 +00007036 Py_ssize_t n, i, j;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007037 Py_ssize_t product, new_size, delta;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038 Py_UNICODE *p;
7039
7040 /* replace strings */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007041 n = stringlib_count(self->str, self->length, str1->str, str1->length,
7042 maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007043 if (n == 0)
7044 goto nothing;
7045 /* new_size = self->length + n * (str2->length - str1->length)); */
7046 delta = (str2->length - str1->length);
7047 if (delta == 0) {
7048 new_size = self->length;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007050 product = n * (str2->length - str1->length);
7051 if ((product / (str2->length - str1->length)) != n) {
7052 PyErr_SetString(PyExc_OverflowError,
7053 "replace string is too long");
7054 return NULL;
7055 }
7056 new_size = self->length + product;
7057 if (new_size < 0) {
7058 PyErr_SetString(PyExc_OverflowError,
7059 "replace string is too long");
7060 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 }
7062 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007063 u = _PyUnicode_New(new_size);
7064 if (!u)
7065 return NULL;
7066 i = 0;
7067 p = u->str;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007068 if (str1->length > 0) {
7069 while (n-- > 0) {
7070 /* look for next match */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007071 j = stringlib_find(self->str+i, self->length-i,
7072 str1->str, str1->length,
7073 i);
7074 if (j == -1)
7075 break;
7076 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00007077 /* copy unchanged part [i:j] */
7078 Py_UNICODE_COPY(p, self->str+i, j-i);
7079 p += j - i;
7080 }
7081 /* copy substitution string */
7082 if (str2->length > 0) {
7083 Py_UNICODE_COPY(p, str2->str, str2->length);
7084 p += str2->length;
7085 }
7086 i = j + str1->length;
7087 }
7088 if (i < self->length)
7089 /* copy tail [i:] */
7090 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7091 } else {
7092 /* interleave */
7093 while (n > 0) {
7094 Py_UNICODE_COPY(p, str2->str, str2->length);
7095 p += str2->length;
7096 if (--n <= 0)
7097 break;
7098 *p++ = self->str[i++];
7099 }
7100 Py_UNICODE_COPY(p, self->str+i, self->length-i);
7101 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 return (PyObject *) u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007104
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00007106 /* nothing to replace; return original string (when possible) */
7107 if (PyUnicode_CheckExact(self)) {
7108 Py_INCREF(self);
7109 return (PyObject *) self;
7110 }
7111 return PyUnicode_FromUnicode(self->str, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112}
7113
7114/* --- Unicode Object Methods --------------------------------------------- */
7115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007116PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007117 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118\n\
7119Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007120characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007121
7122static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007123unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007125 return fixup(self, fixtitle);
7126}
7127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007128PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007130\n\
7131Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00007132have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007133
7134static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007135unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007137 return fixup(self, fixcapitalize);
7138}
7139
#if 0
/* Compiled out: this capwords() implementation is kept for reference. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *previous = PyList_GET_ITEM(words, idx);
        PyObject *capitalized = fixup((PyUnicodeObject *)previous,
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(previous);
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words (NULL separator) to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
7177
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007178/* Argument converter. Coerces to a single unicode character */
7179
7180static int
7181convert_uc(PyObject *obj, void *addr)
7182{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007183 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7184 PyObject *uniobj;
7185 Py_UNICODE *unistr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007186
Benjamin Peterson14339b62009-01-31 16:36:08 +00007187 uniobj = PyUnicode_FromObject(obj);
7188 if (uniobj == NULL) {
7189 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007190 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007191 return 0;
7192 }
7193 if (PyUnicode_GET_SIZE(uniobj) != 1) {
7194 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007196 Py_DECREF(uniobj);
7197 return 0;
7198 }
7199 unistr = PyUnicode_AS_UNICODE(uniobj);
7200 *fillcharloc = unistr[0];
7201 Py_DECREF(uniobj);
7202 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007203}
7204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007205PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007208Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007209done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210
7211static PyObject *
7212unicode_center(PyUnicodeObject *self, PyObject *args)
7213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007214 Py_ssize_t marg, left;
7215 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007216 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007217
Thomas Woutersde017742006-02-16 19:34:37 +00007218 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 return NULL;
7220
Tim Peters7a29bd52001-09-12 03:03:31 +00007221 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 Py_INCREF(self);
7223 return (PyObject*) self;
7224 }
7225
7226 marg = width - self->length;
7227 left = marg / 2 + (marg & width & 1);
7228
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00007229 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007230}
7231
Marc-André Lemburge5034372000-08-08 08:04:29 +00007232#if 0
7233
7234/* This code should go into some future Unicode collation support
7235 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00007236 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00007237
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007238/* speedy UTF-16 code point order comparison */
7239/* gleaned from: */
7240/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7241
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007242static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007243{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007244 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00007245 0, 0, 0, 0, 0, 0, 0, 0,
7246 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00007247 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007248};
7249
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250static int
7251unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7252{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007253 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007254
Guido van Rossumd57fd912000-03-10 22:53:23 +00007255 Py_UNICODE *s1 = str1->str;
7256 Py_UNICODE *s2 = str2->str;
7257
7258 len1 = str1->length;
7259 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007260
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007262 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007263
7264 c1 = *s1++;
7265 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00007266
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 if (c1 > (1<<11) * 26)
7268 c1 += utf16Fixup[c1>>11];
7269 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007270 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007271 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00007272
7273 if (c1 != c2)
7274 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00007275
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00007276 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 }
7278
7279 return (len1 < len2) ? -1 : (len1 != len2);
7280}
7281
Marc-André Lemburge5034372000-08-08 08:04:29 +00007282#else
7283
7284static int
7285unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7286{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007287 register Py_ssize_t len1, len2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007288
7289 Py_UNICODE *s1 = str1->str;
7290 Py_UNICODE *s2 = str2->str;
7291
7292 len1 = str1->length;
7293 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00007294
Marc-André Lemburge5034372000-08-08 08:04:29 +00007295 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00007296 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00007297
Fredrik Lundh45714e92001-06-26 16:39:36 +00007298 c1 = *s1++;
7299 c2 = *s2++;
7300
7301 if (c1 != c2)
7302 return (c1 < c2) ? -1 : 1;
7303
Marc-André Lemburge5034372000-08-08 08:04:29 +00007304 len1--; len2--;
7305 }
7306
7307 return (len1 < len2) ? -1 : (len1 != len2);
7308}
7309
7310#endif
7311
Alexander Belopolsky40018472011-02-26 01:02:56 +00007312int
7313PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314{
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007315 if (PyUnicode_Check(left) && PyUnicode_Check(right))
7316 return unicode_compare((PyUnicodeObject *)left,
7317 (PyUnicodeObject *)right);
Guido van Rossum09dc34f2007-05-04 04:17:33 +00007318 PyErr_Format(PyExc_TypeError,
7319 "Can't compare %.100s and %.100s",
7320 left->ob_type->tp_name,
7321 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322 return -1;
7323}
7324
Martin v. Löwis5b222132007-06-10 09:51:05 +00007325int
7326PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7327{
7328 int i;
7329 Py_UNICODE *id;
7330 assert(PyUnicode_Check(uni));
7331 id = PyUnicode_AS_UNICODE(uni);
7332 /* Compare Unicode string and source character set string */
7333 for (i = 0; id[i] && str[i]; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 if (id[i] != str[i])
7335 return ((int)id[i] < (int)str[i]) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00007336 /* This check keeps Python strings that end in '\0' from comparing equal
7337 to C strings identical up to that point. */
Benjamin Petersona23831f2010-04-25 21:54:00 +00007338 if (PyUnicode_GET_SIZE(uni) != i || id[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007340 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00007341 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00007342 return 0;
7343}
7344
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007345
Benjamin Peterson29060642009-01-31 22:14:21 +00007346#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00007347 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007348
Alexander Belopolsky40018472011-02-26 01:02:56 +00007349PyObject *
7350PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007351{
7352 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007353
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007354 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7355 PyObject *v;
7356 if (((PyUnicodeObject *) left)->length !=
7357 ((PyUnicodeObject *) right)->length) {
7358 if (op == Py_EQ) {
7359 Py_INCREF(Py_False);
7360 return Py_False;
7361 }
7362 if (op == Py_NE) {
7363 Py_INCREF(Py_True);
7364 return Py_True;
7365 }
7366 }
7367 if (left == right)
7368 result = 0;
7369 else
7370 result = unicode_compare((PyUnicodeObject *)left,
7371 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007372
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007373 /* Convert the return value to a Boolean */
7374 switch (op) {
7375 case Py_EQ:
7376 v = TEST_COND(result == 0);
7377 break;
7378 case Py_NE:
7379 v = TEST_COND(result != 0);
7380 break;
7381 case Py_LE:
7382 v = TEST_COND(result <= 0);
7383 break;
7384 case Py_GE:
7385 v = TEST_COND(result >= 0);
7386 break;
7387 case Py_LT:
7388 v = TEST_COND(result == -1);
7389 break;
7390 case Py_GT:
7391 v = TEST_COND(result == 1);
7392 break;
7393 default:
7394 PyErr_BadArgument();
7395 return NULL;
7396 }
7397 Py_INCREF(v);
7398 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007399 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007400
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00007401 Py_INCREF(Py_NotImplemented);
7402 return Py_NotImplemented;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00007403}
7404
Alexander Belopolsky40018472011-02-26 01:02:56 +00007405int
7406PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +00007407{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007408 PyObject *str, *sub;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007409 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007410
7411 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +00007412 sub = PyUnicode_FromObject(element);
7413 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 PyErr_Format(PyExc_TypeError,
7415 "'in <string>' requires string as left operand, not %s",
7416 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007417 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007418 }
7419
Thomas Wouters477c8d52006-05-27 19:21:47 +00007420 str = PyUnicode_FromObject(container);
7421 if (!str) {
7422 Py_DECREF(sub);
7423 return -1;
7424 }
7425
7426 result = stringlib_contains_obj(str, sub);
7427
7428 Py_DECREF(str);
7429 Py_DECREF(sub);
7430
Guido van Rossum403d68b2000-03-13 15:55:09 +00007431 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +00007432}
7433
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434/* Concat to string or Unicode object giving a new Unicode object. */
7435
Alexander Belopolsky40018472011-02-26 01:02:56 +00007436PyObject *
7437PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007438{
7439 PyUnicodeObject *u = NULL, *v = NULL, *w;
7440
7441 /* Coerce the two arguments */
7442 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7443 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7446 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
7449 /* Shortcuts */
7450 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 Py_DECREF(v);
7452 return (PyObject *)u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 }
7454 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 Py_DECREF(u);
7456 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 }
7458
7459 /* Concat the two Unicode strings */
7460 w = _PyUnicode_New(u->length + v->length);
7461 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007463 Py_UNICODE_COPY(w->str, u->str, u->length);
7464 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7465
7466 Py_DECREF(u);
7467 Py_DECREF(v);
7468 return (PyObject *)w;
7469
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007471 Py_XDECREF(u);
7472 Py_XDECREF(v);
7473 return NULL;
7474}
7475
Walter Dörwald1ab83302007-05-18 17:15:44 +00007476void
7477PyUnicode_Append(PyObject **pleft, PyObject *right)
7478{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007479 PyObject *new;
7480 if (*pleft == NULL)
7481 return;
7482 if (right == NULL || !PyUnicode_Check(*pleft)) {
7483 Py_DECREF(*pleft);
7484 *pleft = NULL;
7485 return;
7486 }
7487 new = PyUnicode_Concat(*pleft, right);
7488 Py_DECREF(*pleft);
7489 *pleft = new;
Walter Dörwald1ab83302007-05-18 17:15:44 +00007490}
7491
7492void
7493PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7494{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007495 PyUnicode_Append(pleft, right);
7496 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +00007497}
7498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007499PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007501\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00007502Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00007503string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007504interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007505
7506static PyObject *
7507unicode_count(PyUnicodeObject *self, PyObject *args)
7508{
7509 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007510 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007511 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007512 PyObject *result;
7513
Guido van Rossumb8872e62000-05-09 14:14:27 +00007514 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007516 return NULL;
7517
7518 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007519 (PyObject *)substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007520 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007522
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007523 ADJUST_INDICES(start, end, self->length);
Christian Heimes217cfd12007-12-02 14:31:20 +00007524 result = PyLong_FromSsize_t(
Thomas Wouters477c8d52006-05-27 19:21:47 +00007525 stringlib_count(self->str + start, end - start,
Antoine Pitrouf2c54842010-01-13 08:07:53 +00007526 substring->str, substring->length,
7527 PY_SSIZE_T_MAX)
Thomas Wouters477c8d52006-05-27 19:21:47 +00007528 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007529
7530 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007531
Guido van Rossumd57fd912000-03-10 22:53:23 +00007532 return result;
7533}
7534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007535PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +00007536 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007537\n\
Victor Stinnere14e2122010-11-07 18:41:46 +00007538Encode S using the codec registered for encoding. Default encoding\n\
7539is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00007540handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007541a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7542'xmlcharrefreplace' as well as any other name registered with\n\
7543codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544
7545static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +00007546unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007547{
Benjamin Peterson308d6372009-09-18 21:42:35 +00007548 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +00007549 char *encoding = NULL;
7550 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +00007551
Benjamin Peterson308d6372009-09-18 21:42:35 +00007552 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7553 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007554 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +00007555 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00007556}
7557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007558PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007560\n\
7561Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007562If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007563
7564static PyObject*
7565unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7566{
7567 Py_UNICODE *e;
7568 Py_UNICODE *p;
7569 Py_UNICODE *q;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007570 Py_UNICODE *qe;
7571 Py_ssize_t i, j, incr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007572 PyUnicodeObject *u;
7573 int tabsize = 8;
7574
7575 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007577
Thomas Wouters7e474022000-07-16 12:04:32 +00007578 /* First pass: determine size of output string */
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007579 i = 0; /* chars up to and including most recent \n or \r */
7580 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7581 e = self->str + self->length; /* end of input */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 for (p = self->str; p < e; p++)
7583 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 if (tabsize > 0) {
7585 incr = tabsize - (j % tabsize); /* cannot overflow */
7586 if (j > PY_SSIZE_T_MAX - incr)
7587 goto overflow1;
7588 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007589 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007590 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 if (j > PY_SSIZE_T_MAX - 1)
7593 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 j++;
7595 if (*p == '\n' || *p == '\r') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 if (i > PY_SSIZE_T_MAX - j)
7597 goto overflow1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 i += j;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007599 j = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 }
7601 }
7602
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007603 if (i > PY_SSIZE_T_MAX - j)
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 goto overflow1;
Guido van Rossumcd16bf62007-06-13 18:07:49 +00007605
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 /* Second pass: create output string and fill it */
7607 u = _PyUnicode_New(i + j);
7608 if (!u)
7609 return NULL;
7610
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007611 j = 0; /* same as in first pass */
7612 q = u->str; /* next output char */
7613 qe = u->str + u->length; /* end of output */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614
7615 for (p = self->str; p < e; p++)
7616 if (*p == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 if (tabsize > 0) {
7618 i = tabsize - (j % tabsize);
7619 j += i;
7620 while (i--) {
7621 if (q >= qe)
7622 goto overflow2;
7623 *q++ = ' ';
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007624 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007626 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 else {
7628 if (q >= qe)
7629 goto overflow2;
7630 *q++ = *p;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007631 j++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632 if (*p == '\n' || *p == '\r')
7633 j = 0;
7634 }
7635
7636 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +00007637
7638 overflow2:
7639 Py_DECREF(u);
7640 overflow1:
7641 PyErr_SetString(PyExc_OverflowError, "new string is too long");
7642 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007643}
7644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007645PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007647\n\
7648Return the lowest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00007649such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007650arguments start and end are interpreted as in slice notation.\n\
7651\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007652Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653
7654static PyObject *
7655unicode_find(PyUnicodeObject *self, PyObject *args)
7656{
Thomas Wouters477c8d52006-05-27 19:21:47 +00007657 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007658 Py_ssize_t start;
7659 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007660 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007661
Christian Heimes9cd17752007-11-18 19:35:23 +00007662 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007664
Thomas Wouters477c8d52006-05-27 19:21:47 +00007665 result = stringlib_find_slice(
7666 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7667 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7668 start, end
7669 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670
7671 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007672
Christian Heimes217cfd12007-12-02 14:31:20 +00007673 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674}
7675
7676static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00007677unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007678{
7679 if (index < 0 || index >= self->length) {
7680 PyErr_SetString(PyExc_IndexError, "string index out of range");
7681 return NULL;
7682 }
7683
7684 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7685}
7686
Guido van Rossumc2504932007-09-18 19:42:40 +00007687/* Believe it or not, this produces the same value for ASCII strings
7688 as string_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007689static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +00007690unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007691{
Guido van Rossumc2504932007-09-18 19:42:40 +00007692 Py_ssize_t len;
7693 Py_UNICODE *p;
Benjamin Peterson8f67d082010-10-17 20:54:53 +00007694 Py_hash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +00007695
7696 if (self->hash != -1)
7697 return self->hash;
Christian Heimes90aa7642007-12-19 02:45:37 +00007698 len = Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007699 p = self->str;
7700 x = *p << 7;
7701 while (--len >= 0)
7702 x = (1000003*x) ^ *p++;
Christian Heimes90aa7642007-12-19 02:45:37 +00007703 x ^= Py_SIZE(self);
Guido van Rossumc2504932007-09-18 19:42:40 +00007704 if (x == -1)
7705 x = -2;
7706 self->hash = x;
7707 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708}
7709
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007710PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007713Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714
7715static PyObject *
7716unicode_index(PyUnicodeObject *self, PyObject *args)
7717{
Martin v. Löwis18e16552006-02-15 17:27:45 +00007718 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00007719 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00007720 Py_ssize_t start;
7721 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722
Christian Heimes9cd17752007-11-18 19:35:23 +00007723 if (!_ParseTupleFinds(args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725
Thomas Wouters477c8d52006-05-27 19:21:47 +00007726 result = stringlib_find_slice(
7727 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7728 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7729 start, end
7730 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731
7732 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00007733
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734 if (result < 0) {
7735 PyErr_SetString(PyExc_ValueError, "substring not found");
7736 return NULL;
7737 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00007738
Christian Heimes217cfd12007-12-02 14:31:20 +00007739 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007740}
7741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007742PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007743 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007744\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007745Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007746at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747
7748static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007749unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750{
7751 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7752 register const Py_UNICODE *e;
7753 int cased;
7754
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 /* Shortcut for single character strings */
7756 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007759 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007760 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007762
Guido van Rossumd57fd912000-03-10 22:53:23 +00007763 e = p + PyUnicode_GET_SIZE(self);
7764 cased = 0;
7765 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007767
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7769 return PyBool_FromLong(0);
7770 else if (!cased && Py_UNICODE_ISLOWER(ch))
7771 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007773 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007774}
7775
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007776PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007778\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007779Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007780at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007781
7782static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007783unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007784{
7785 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7786 register const Py_UNICODE *e;
7787 int cased;
7788
Guido van Rossumd57fd912000-03-10 22:53:23 +00007789 /* Shortcut for single character strings */
7790 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007792
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007793 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007794 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007796
Guido van Rossumd57fd912000-03-10 22:53:23 +00007797 e = p + PyUnicode_GET_SIZE(self);
7798 cased = 0;
7799 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007801
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7803 return PyBool_FromLong(0);
7804 else if (!cased && Py_UNICODE_ISUPPER(ch))
7805 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007806 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007807 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808}
7809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007810PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007811 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007813Return True if S is a titlecased string and there is at least one\n\
7814character in S, i.e. upper- and titlecase characters may only\n\
7815follow uncased characters and lowercase characters only cased ones.\n\
7816Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007817
7818static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007819unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007820{
7821 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7822 register const Py_UNICODE *e;
7823 int cased, previous_is_cased;
7824
Guido van Rossumd57fd912000-03-10 22:53:23 +00007825 /* Shortcut for single character strings */
7826 if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007827 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7828 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007829
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007830 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007831 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007833
Guido van Rossumd57fd912000-03-10 22:53:23 +00007834 e = p + PyUnicode_GET_SIZE(self);
7835 cased = 0;
7836 previous_is_cased = 0;
7837 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00007839
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7841 if (previous_is_cased)
7842 return PyBool_FromLong(0);
7843 previous_is_cased = 1;
7844 cased = 1;
7845 }
7846 else if (Py_UNICODE_ISLOWER(ch)) {
7847 if (!previous_is_cased)
7848 return PyBool_FromLong(0);
7849 previous_is_cased = 1;
7850 cased = 1;
7851 }
7852 else
7853 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007855 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007856}
7857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007858PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007861Return True if all characters in S are whitespace\n\
7862and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007863
7864static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007865unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007866{
7867 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7868 register const Py_UNICODE *e;
7869
Guido van Rossumd57fd912000-03-10 22:53:23 +00007870 /* Shortcut for single character strings */
7871 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 Py_UNICODE_ISSPACE(*p))
7873 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007875 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007876 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007878
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 e = p + PyUnicode_GET_SIZE(self);
7880 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 if (!Py_UNICODE_ISSPACE(*p))
7882 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007883 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007884 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007885}
7886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007887PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007889\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007890Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007891and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007892
7893static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007894unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007895{
7896 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7897 register const Py_UNICODE *e;
7898
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007899 /* Shortcut for single character strings */
7900 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 Py_UNICODE_ISALPHA(*p))
7902 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007903
7904 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007905 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007907
7908 e = p + PyUnicode_GET_SIZE(self);
7909 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 if (!Py_UNICODE_ISALPHA(*p))
7911 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007912 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007913 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007914}
7915
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007916PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007918\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007919Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007920and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007921
7922static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007923unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007924{
7925 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7926 register const Py_UNICODE *e;
7927
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007928 /* Shortcut for single character strings */
7929 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 Py_UNICODE_ISALNUM(*p))
7931 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007932
7933 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007934 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007936
7937 e = p + PyUnicode_GET_SIZE(self);
7938 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 if (!Py_UNICODE_ISALNUM(*p))
7940 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007941 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007942 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00007943}
7944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007945PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007946 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007947\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00007948Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007949False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007950
7951static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007952unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953{
7954 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7955 register const Py_UNICODE *e;
7956
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957 /* Shortcut for single character strings */
7958 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 Py_UNICODE_ISDECIMAL(*p))
7960 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007961
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007962 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007963 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007964 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007965
Guido van Rossumd57fd912000-03-10 22:53:23 +00007966 e = p + PyUnicode_GET_SIZE(self);
7967 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 if (!Py_UNICODE_ISDECIMAL(*p))
7969 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00007971 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972}
7973
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007974PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00007977Return True if all characters in S are digits\n\
7978and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979
7980static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00007981unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007982{
7983 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7984 register const Py_UNICODE *e;
7985
Guido van Rossumd57fd912000-03-10 22:53:23 +00007986 /* Shortcut for single character strings */
7987 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 Py_UNICODE_ISDIGIT(*p))
7989 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007991 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00007992 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00007994
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 e = p + PyUnicode_GET_SIZE(self);
7996 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 if (!Py_UNICODE_ISDIGIT(*p))
7998 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008000 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001}
8002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008003PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00008006Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008007False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008008
8009static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008010unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011{
8012 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8013 register const Py_UNICODE *e;
8014
Guido van Rossumd57fd912000-03-10 22:53:23 +00008015 /* Shortcut for single character strings */
8016 if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 Py_UNICODE_ISNUMERIC(*p))
8018 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008019
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008020 /* Special case for empty strings */
Martin v. Löwisdea59e52006-01-05 10:00:36 +00008021 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008022 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00008023
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 e = p + PyUnicode_GET_SIZE(self);
8025 for (; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 if (!Py_UNICODE_ISNUMERIC(*p))
8027 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00008029 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008030}
8031
Martin v. Löwis47383402007-08-15 07:32:56 +00008032int
8033PyUnicode_IsIdentifier(PyObject *self)
8034{
8035 register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8036 register const Py_UNICODE *e;
8037
8038 /* Special case for empty strings */
8039 if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008041
8042 /* PEP 3131 says that the first character must be in
8043 XID_Start and subsequent characters in XID_Continue,
8044 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +00008045 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +00008046 letters, digits, underscore). However, given the current
8047 definition of XID_Start and XID_Continue, it is sufficient
8048 to check just for these, except that _ must be allowed
8049 as starting an identifier. */
8050 if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
8051 return 0;
8052
8053 e = p + PyUnicode_GET_SIZE(self);
8054 for (p++; p < e; p++) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 if (!_PyUnicode_IsXidContinue(*p))
8056 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +00008057 }
8058 return 1;
8059}
8060
8061PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +00008063\n\
8064Return True if S is a valid identifier according\n\
8065to the language definition.");
8066
8067static PyObject*
8068unicode_isidentifier(PyObject *self)
8069{
8070 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8071}
8072
Georg Brandl559e5d72008-06-11 18:37:52 +00008073PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +00008075\n\
8076Return True if all characters in S are considered\n\
8077printable in repr() or S is empty, False otherwise.");
8078
8079static PyObject*
8080unicode_isprintable(PyObject *self)
8081{
8082 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8083 register const Py_UNICODE *e;
8084
8085 /* Shortcut for single character strings */
8086 if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8087 Py_RETURN_TRUE;
8088 }
8089
8090 e = p + PyUnicode_GET_SIZE(self);
8091 for (; p < e; p++) {
8092 if (!Py_UNICODE_ISPRINTABLE(*p)) {
8093 Py_RETURN_FALSE;
8094 }
8095 }
8096 Py_RETURN_TRUE;
8097}
8098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008099PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +00008100 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008101\n\
8102Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +00008103iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008104
8105static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008106unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008107{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008108 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109}
8110
Martin v. Löwis18e16552006-02-15 17:27:45 +00008111static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112unicode_length(PyUnicodeObject *self)
8113{
8114 return self->length;
8115}
8116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008117PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008119\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008120Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008121done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122
8123static PyObject *
8124unicode_ljust(PyUnicodeObject *self, PyObject *args)
8125{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008126 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008127 Py_UNICODE fillchar = ' ';
8128
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008129 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008130 return NULL;
8131
Tim Peters7a29bd52001-09-12 03:03:31 +00008132 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008133 Py_INCREF(self);
8134 return (PyObject*) self;
8135 }
8136
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008137 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008138}
8139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008140PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008142\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008143Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144
8145static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008146unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 return fixup(self, fixlower);
8149}
8150
/* Selectors for the strip family.  Each value doubles as an index into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
8159
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008160/* externally visible for str.strip(unicode) */
8161PyObject *
8162_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8163{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008164 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8165 Py_ssize_t len = PyUnicode_GET_SIZE(self);
8166 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8167 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8168 Py_ssize_t i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008169
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008171
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 i = 0;
8173 if (striptype != RIGHTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8175 i++;
8176 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008177 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008178
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 j = len;
8180 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 do {
8182 j--;
8183 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8184 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008185 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008186
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 Py_INCREF(self);
8189 return (PyObject*)self;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008190 }
8191 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 return PyUnicode_FromUnicode(s+i, j-i);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008193}
8194
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195
8196static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008197do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008198{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008199 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8200 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008201
Benjamin Peterson14339b62009-01-31 16:36:08 +00008202 i = 0;
8203 if (striptype != RIGHTSTRIP) {
8204 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8205 i++;
8206 }
8207 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008208
Benjamin Peterson14339b62009-01-31 16:36:08 +00008209 j = len;
8210 if (striptype != LEFTSTRIP) {
8211 do {
8212 j--;
8213 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8214 j++;
8215 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008216
Benjamin Peterson14339b62009-01-31 16:36:08 +00008217 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8218 Py_INCREF(self);
8219 return (PyObject*)self;
8220 }
8221 else
8222 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008223}
8224
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008225
8226static PyObject *
8227do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8228{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008229 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008230
Benjamin Peterson14339b62009-01-31 16:36:08 +00008231 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8232 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008233
Benjamin Peterson14339b62009-01-31 16:36:08 +00008234 if (sep != NULL && sep != Py_None) {
8235 if (PyUnicode_Check(sep))
8236 return _PyUnicode_XStrip(self, striptype, sep);
8237 else {
8238 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 "%s arg must be None or str",
8240 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 return NULL;
8242 }
8243 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008244
Benjamin Peterson14339b62009-01-31 16:36:08 +00008245 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008246}
8247
8248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008249PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008251\n\
8252Return a copy of the string S with leading and trailing\n\
8253whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008254If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008255
8256static PyObject *
8257unicode_strip(PyUnicodeObject *self, PyObject *args)
8258{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008259 if (PyTuple_GET_SIZE(args) == 0)
8260 return do_strip(self, BOTHSTRIP); /* Common case */
8261 else
8262 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008263}
8264
8265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008266PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008268\n\
8269Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008270If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008271
8272static PyObject *
8273unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8274{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008275 if (PyTuple_GET_SIZE(args) == 0)
8276 return do_strip(self, LEFTSTRIP); /* Common case */
8277 else
8278 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008279}
8280
8281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008282PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008284\n\
8285Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008286If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008287
8288static PyObject *
8289unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8290{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008291 if (PyTuple_GET_SIZE(args) == 0)
8292 return do_strip(self, RIGHTSTRIP); /* Common case */
8293 else
8294 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00008295}
8296
8297
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +00008299unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300{
8301 PyUnicodeObject *u;
8302 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008303 Py_ssize_t nchars;
Tim Peters8f422462000-09-09 06:13:41 +00008304 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305
Georg Brandl222de0f2009-04-12 12:01:50 +00008306 if (len < 1) {
8307 Py_INCREF(unicode_empty);
8308 return (PyObject *)unicode_empty;
8309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008310
Tim Peters7a29bd52001-09-12 03:03:31 +00008311 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312 /* no repeat, return original string */
8313 Py_INCREF(str);
8314 return (PyObject*) str;
8315 }
Tim Peters8f422462000-09-09 06:13:41 +00008316
8317 /* ensure # of chars needed doesn't overflow int and # of bytes
8318 * needed doesn't overflow size_t
8319 */
8320 nchars = len * str->length;
Georg Brandl222de0f2009-04-12 12:01:50 +00008321 if (nchars / len != str->length) {
Tim Peters8f422462000-09-09 06:13:41 +00008322 PyErr_SetString(PyExc_OverflowError,
8323 "repeated string is too long");
8324 return NULL;
8325 }
8326 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8327 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8328 PyErr_SetString(PyExc_OverflowError,
8329 "repeated string is too long");
8330 return NULL;
8331 }
8332 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 if (!u)
8334 return NULL;
8335
8336 p = u->str;
8337
Georg Brandl222de0f2009-04-12 12:01:50 +00008338 if (str->length == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00008339 Py_UNICODE_FILL(p, str->str[0], len);
8340 } else {
Georg Brandl222de0f2009-04-12 12:01:50 +00008341 Py_ssize_t done = str->length; /* number of characters copied this far */
8342 Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 while (done < nchars) {
Christian Heimescc47b052008-03-25 14:56:36 +00008344 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008345 Py_UNICODE_COPY(p+done, p, n);
8346 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 }
8349
8350 return (PyObject*) u;
8351}
8352
Alexander Belopolsky40018472011-02-26 01:02:56 +00008353PyObject *
8354PyUnicode_Replace(PyObject *obj,
8355 PyObject *subobj,
8356 PyObject *replobj,
8357 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358{
8359 PyObject *self;
8360 PyObject *str1;
8361 PyObject *str2;
8362 PyObject *result;
8363
8364 self = PyUnicode_FromObject(obj);
8365 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 str1 = PyUnicode_FromObject(subobj);
8368 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 Py_DECREF(self);
8370 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371 }
8372 str2 = PyUnicode_FromObject(replobj);
8373 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 Py_DECREF(self);
8375 Py_DECREF(str1);
8376 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008377 }
Tim Petersced69f82003-09-16 20:30:58 +00008378 result = replace((PyUnicodeObject *)self,
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 (PyUnicodeObject *)str1,
8380 (PyUnicodeObject *)str2,
8381 maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008382 Py_DECREF(self);
8383 Py_DECREF(str1);
8384 Py_DECREF(str2);
8385 return result;
8386}
8387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008388PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +00008389 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390\n\
8391Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +00008392old replaced by new. If the optional argument count is\n\
8393given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394
8395static PyObject*
8396unicode_replace(PyUnicodeObject *self, PyObject *args)
8397{
8398 PyUnicodeObject *str1;
8399 PyUnicodeObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008400 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401 PyObject *result;
8402
Martin v. Löwis18e16552006-02-15 17:27:45 +00008403 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 return NULL;
8405 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8406 if (str1 == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008409 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 Py_DECREF(str1);
8411 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00008412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413
8414 result = replace(self, str1, str2, maxcount);
8415
8416 Py_DECREF(str1);
8417 Py_DECREF(str2);
8418 return result;
8419}
8420
Alexander Belopolsky40018472011-02-26 01:02:56 +00008421static PyObject *
8422unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008423{
Walter Dörwald79e913e2007-05-12 11:08:06 +00008424 PyObject *repr;
Walter Dörwald1ab83302007-05-18 17:15:44 +00008425 Py_UNICODE *p;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008426 Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8427 Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8428
8429 /* XXX(nnorwitz): rather than over-allocating, it would be
8430 better to choose a different scheme. Perhaps scan the
8431 first N-chars of the string and allocate based on that size.
8432 */
8433 /* Initial allocation is based on the longest-possible unichr
8434 escape.
8435
8436 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8437 unichr, so in this case it's the longest unichr escape. In
8438 narrow (UTF-16) builds this is five chars per source unichr
8439 since there are two unichrs in the surrogate pair, so in narrow
8440 (UTF-16) builds it's not the longest unichr escape.
8441
8442 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8443 so in the narrow (UTF-16) build case it's the longest unichr
8444 escape.
8445 */
8446
Walter Dörwald1ab83302007-05-18 17:15:44 +00008447 repr = PyUnicode_FromUnicode(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 2 /* quotes */
Walter Dörwald79e913e2007-05-12 11:08:06 +00008449#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 + 10*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008451#else
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 + 6*size
Walter Dörwald79e913e2007-05-12 11:08:06 +00008453#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 + 1);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008455 if (repr == NULL)
8456 return NULL;
8457
Walter Dörwald1ab83302007-05-18 17:15:44 +00008458 p = PyUnicode_AS_UNICODE(repr);
Walter Dörwald79e913e2007-05-12 11:08:06 +00008459
8460 /* Add quote */
8461 *p++ = (findchar(s, size, '\'') &&
8462 !findchar(s, size, '"')) ? '"' : '\'';
8463 while (size-- > 0) {
8464 Py_UNICODE ch = *s++;
8465
8466 /* Escape quotes and backslashes */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008467 if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008468 *p++ = '\\';
Walter Dörwald1ab83302007-05-18 17:15:44 +00008469 *p++ = ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00008470 continue;
8471 }
8472
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008474 if (ch == '\t') {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008475 *p++ = '\\';
8476 *p++ = 't';
8477 }
8478 else if (ch == '\n') {
8479 *p++ = '\\';
8480 *p++ = 'n';
8481 }
8482 else if (ch == '\r') {
8483 *p++ = '\\';
8484 *p++ = 'r';
8485 }
8486
8487 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +00008488 else if (ch < ' ' || ch == 0x7F) {
Walter Dörwald79e913e2007-05-12 11:08:06 +00008489 *p++ = '\\';
8490 *p++ = 'x';
8491 *p++ = hexdigits[(ch >> 4) & 0x000F];
8492 *p++ = hexdigits[ch & 0x000F];
8493 }
8494
Georg Brandl559e5d72008-06-11 18:37:52 +00008495 /* Copy ASCII characters as-is */
8496 else if (ch < 0x7F) {
8497 *p++ = ch;
8498 }
8499
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +00008501 else {
8502 Py_UCS4 ucs = ch;
8503
8504#ifndef Py_UNICODE_WIDE
8505 Py_UNICODE ch2 = 0;
8506 /* Get code point from surrogate pair */
8507 if (size > 0) {
8508 ch2 = *s;
8509 if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 && ch2 <= 0xDFFF) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 + 0x00010000;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 s++;
Georg Brandl559e5d72008-06-11 18:37:52 +00008514 size--;
8515 }
8516 }
8517#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00008518 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +00008519 (categories Z* and C* except ASCII space)
8520 */
8521 if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8522 /* Map 8-bit characters to '\xhh' */
8523 if (ucs <= 0xff) {
8524 *p++ = '\\';
8525 *p++ = 'x';
8526 *p++ = hexdigits[(ch >> 4) & 0x000F];
8527 *p++ = hexdigits[ch & 0x000F];
8528 }
8529 /* Map 21-bit characters to '\U00xxxxxx' */
8530 else if (ucs >= 0x10000) {
8531 *p++ = '\\';
8532 *p++ = 'U';
8533 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8534 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8535 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8536 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8537 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8538 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8539 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8540 *p++ = hexdigits[ucs & 0x0000000F];
8541 }
8542 /* Map 16-bit characters to '\uxxxx' */
8543 else {
8544 *p++ = '\\';
8545 *p++ = 'u';
8546 *p++ = hexdigits[(ucs >> 12) & 0x000F];
8547 *p++ = hexdigits[(ucs >> 8) & 0x000F];
8548 *p++ = hexdigits[(ucs >> 4) & 0x000F];
8549 *p++ = hexdigits[ucs & 0x000F];
8550 }
8551 }
8552 /* Copy characters as-is */
8553 else {
8554 *p++ = ch;
8555#ifndef Py_UNICODE_WIDE
8556 if (ucs >= 0x10000)
8557 *p++ = ch2;
8558#endif
8559 }
8560 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00008561 }
8562 /* Add quote */
Walter Dörwald1ab83302007-05-18 17:15:44 +00008563 *p++ = PyUnicode_AS_UNICODE(repr)[0];
Walter Dörwald79e913e2007-05-12 11:08:06 +00008564
8565 *p = '\0';
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00008566 PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
Walter Dörwald79e913e2007-05-12 11:08:06 +00008567 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568}
8569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008570PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572\n\
8573Return the highest index in S where substring sub is found,\n\
Guido van Rossum806c2462007-08-06 23:33:07 +00008574such that sub is contained within s[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575arguments start and end are interpreted as in slice notation.\n\
8576\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008577Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578
8579static PyObject *
8580unicode_rfind(PyUnicodeObject *self, PyObject *args)
8581{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008582 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008583 Py_ssize_t start;
8584 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008585 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586
Christian Heimes9cd17752007-11-18 19:35:23 +00008587 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008588 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589
Thomas Wouters477c8d52006-05-27 19:21:47 +00008590 result = stringlib_rfind_slice(
8591 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8592 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8593 start, end
8594 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595
8596 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008597
Christian Heimes217cfd12007-12-02 14:31:20 +00008598 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599}
8600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008601PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008604Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605
8606static PyObject *
8607unicode_rindex(PyUnicodeObject *self, PyObject *args)
8608{
Thomas Wouters477c8d52006-05-27 19:21:47 +00008609 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +00008610 Py_ssize_t start;
8611 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008612 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613
Christian Heimes9cd17752007-11-18 19:35:23 +00008614 if (!_ParseTupleFinds(args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616
Thomas Wouters477c8d52006-05-27 19:21:47 +00008617 result = stringlib_rfind_slice(
8618 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8619 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8620 start, end
8621 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622
8623 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008624
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 if (result < 0) {
8626 PyErr_SetString(PyExc_ValueError, "substring not found");
8627 return NULL;
8628 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008629 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630}
8631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008632PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008635Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008636done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637
8638static PyObject *
8639unicode_rjust(PyUnicodeObject *self, PyObject *args)
8640{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008641 Py_ssize_t width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008642 Py_UNICODE fillchar = ' ';
8643
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008644 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645 return NULL;
8646
Tim Peters7a29bd52001-09-12 03:03:31 +00008647 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648 Py_INCREF(self);
8649 return (PyObject*) self;
8650 }
8651
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00008652 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653}
8654
Alexander Belopolsky40018472011-02-26 01:02:56 +00008655PyObject *
8656PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657{
8658 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008659
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 s = PyUnicode_FromObject(s);
8661 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008662 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 if (sep != NULL) {
8664 sep = PyUnicode_FromObject(sep);
8665 if (sep == NULL) {
8666 Py_DECREF(s);
8667 return NULL;
8668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 }
8670
8671 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8672
8673 Py_DECREF(s);
8674 Py_XDECREF(sep);
8675 return result;
8676}
8677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008678PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680\n\
8681Return a list of the words in S, using sep as the\n\
8682delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +00008683splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +00008684whitespace string is a separator and empty strings are\n\
8685removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008686
8687static PyObject*
8688unicode_split(PyUnicodeObject *self, PyObject *args)
8689{
8690 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008691 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008692
Martin v. Löwis18e16552006-02-15 17:27:45 +00008693 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008694 return NULL;
8695
8696 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 return split(self, (PyUnicodeObject *)substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702}
8703
Thomas Wouters477c8d52006-05-27 19:21:47 +00008704PyObject *
8705PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8706{
8707 PyObject* str_obj;
8708 PyObject* sep_obj;
8709 PyObject* out;
8710
8711 str_obj = PyUnicode_FromObject(str_in);
8712 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008714 sep_obj = PyUnicode_FromObject(sep_in);
8715 if (!sep_obj) {
8716 Py_DECREF(str_obj);
8717 return NULL;
8718 }
8719
8720 out = stringlib_partition(
8721 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8722 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8723 );
8724
8725 Py_DECREF(sep_obj);
8726 Py_DECREF(str_obj);
8727
8728 return out;
8729}
8730
8731
8732PyObject *
8733PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8734{
8735 PyObject* str_obj;
8736 PyObject* sep_obj;
8737 PyObject* out;
8738
8739 str_obj = PyUnicode_FromObject(str_in);
8740 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008742 sep_obj = PyUnicode_FromObject(sep_in);
8743 if (!sep_obj) {
8744 Py_DECREF(str_obj);
8745 return NULL;
8746 }
8747
8748 out = stringlib_rpartition(
8749 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8750 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8751 );
8752
8753 Py_DECREF(sep_obj);
8754 Py_DECREF(str_obj);
8755
8756 return out;
8757}
8758
8759PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008761\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008762Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008763the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008764found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008765
8766static PyObject*
8767unicode_partition(PyUnicodeObject *self, PyObject *separator)
8768{
8769 return PyUnicode_Partition((PyObject *)self, separator);
8770}
8771
8772PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +00008773 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008774\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +00008775Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +00008776the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +00008777separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +00008778
8779static PyObject*
8780unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8781{
8782 return PyUnicode_RPartition((PyObject *)self, separator);
8783}
8784
Alexander Belopolsky40018472011-02-26 01:02:56 +00008785PyObject *
8786PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008787{
8788 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008789
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008790 s = PyUnicode_FromObject(s);
8791 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008792 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 if (sep != NULL) {
8794 sep = PyUnicode_FromObject(sep);
8795 if (sep == NULL) {
8796 Py_DECREF(s);
8797 return NULL;
8798 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008799 }
8800
8801 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8802
8803 Py_DECREF(s);
8804 Py_XDECREF(sep);
8805 return result;
8806}
8807
8808PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008810\n\
8811Return a list of the words in S, using sep as the\n\
8812delimiter string, starting at the end of the string and\n\
8813working to the front. If maxsplit is given, at most maxsplit\n\
8814splits are done. If sep is not specified, any whitespace string\n\
8815is a separator.");
8816
8817static PyObject*
8818unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8819{
8820 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008821 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008822
Martin v. Löwis18e16552006-02-15 17:27:45 +00008823 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008824 return NULL;
8825
8826 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008828 else if (PyUnicode_Check(substring))
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008830 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008831 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00008832}
8833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008834PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836\n\
8837Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00008838Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008839is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840
8841static PyObject*
8842unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8843{
Guido van Rossum86662912000-04-11 15:38:46 +00008844 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008845
Guido van Rossum86662912000-04-11 15:38:46 +00008846 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 return NULL;
8848
Guido van Rossum86662912000-04-11 15:38:46 +00008849 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008850}
8851
8852static
Guido van Rossumf15a29f2007-05-04 00:41:39 +00008853PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854{
Walter Dörwald346737f2007-05-31 10:44:43 +00008855 if (PyUnicode_CheckExact(self)) {
8856 Py_INCREF(self);
8857 return self;
8858 } else
8859 /* Subtype -- return genuine unicode string with the same value. */
8860 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8861 PyUnicode_GET_SIZE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +00008862}
8863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008864PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008865 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008866\n\
8867Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008868and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869
8870static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008871unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008872{
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 return fixup(self, fixswapcase);
8874}
8875
Georg Brandlceee0772007-11-27 23:48:05 +00008876PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008878\n\
8879Return a translation table usable for str.translate().\n\
8880If there is only one argument, it must be a dictionary mapping Unicode\n\
8881ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008882Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +00008883If there are two arguments, they must be strings of equal length, and\n\
8884in the resulting dictionary, each character in x will be mapped to the\n\
8885character at the same position in y. If there is a third argument, it\n\
8886must be a string, whose characters will be mapped to None in the result.");
8887
8888static PyObject*
8889unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8890{
8891 PyObject *x, *y = NULL, *z = NULL;
8892 PyObject *new = NULL, *key, *value;
8893 Py_ssize_t i = 0;
8894 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008895
Georg Brandlceee0772007-11-27 23:48:05 +00008896 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8897 return NULL;
8898 new = PyDict_New();
8899 if (!new)
8900 return NULL;
8901 if (y != NULL) {
8902 /* x must be a string too, of equal length */
8903 Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8904 if (!PyUnicode_Check(x)) {
8905 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8906 "be a string if there is a second argument");
8907 goto err;
8908 }
8909 if (PyUnicode_GET_SIZE(x) != ylen) {
8910 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8911 "arguments must have equal length");
8912 goto err;
8913 }
8914 /* create entries for translating chars in x to those in y */
8915 for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008916 key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8917 value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008918 if (!key || !value)
8919 goto err;
8920 res = PyDict_SetItem(new, key, value);
8921 Py_DECREF(key);
8922 Py_DECREF(value);
8923 if (res < 0)
8924 goto err;
8925 }
8926 /* create entries for deleting chars in z */
8927 if (z != NULL) {
8928 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Christian Heimes217cfd12007-12-02 14:31:20 +00008929 key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
Georg Brandlceee0772007-11-27 23:48:05 +00008930 if (!key)
8931 goto err;
8932 res = PyDict_SetItem(new, key, Py_None);
8933 Py_DECREF(key);
8934 if (res < 0)
8935 goto err;
8936 }
8937 }
8938 } else {
8939 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +00008940 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008941 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8942 "to maketrans it must be a dict");
8943 goto err;
8944 }
8945 /* copy entries into the new dict, converting string keys to int keys */
8946 while (PyDict_Next(x, &i, &key, &value)) {
8947 if (PyUnicode_Check(key)) {
8948 /* convert string keys to integer keys */
8949 PyObject *newkey;
8950 if (PyUnicode_GET_SIZE(key) != 1) {
8951 PyErr_SetString(PyExc_ValueError, "string keys in translate "
8952 "table must be of length 1");
8953 goto err;
8954 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008955 newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
Georg Brandlceee0772007-11-27 23:48:05 +00008956 if (!newkey)
8957 goto err;
8958 res = PyDict_SetItem(new, newkey, value);
8959 Py_DECREF(newkey);
8960 if (res < 0)
8961 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +00008962 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +00008963 /* just keep integer keys */
8964 if (PyDict_SetItem(new, key, value) < 0)
8965 goto err;
8966 } else {
8967 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8968 "be strings or integers");
8969 goto err;
8970 }
8971 }
8972 }
8973 return new;
8974 err:
8975 Py_DECREF(new);
8976 return NULL;
8977}
8978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008979PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981\n\
8982Return a copy of the string S, where all characters have been mapped\n\
8983through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00008984Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00008985Unmapped characters are left untouched. Characters mapped to None\n\
8986are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987
8988static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00008989unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990{
Georg Brandlceee0772007-11-27 23:48:05 +00008991 return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992}
8993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008994PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00008997Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998
8999static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009000unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 return fixup(self, fixupper);
9003}
9004
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009005PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009006 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009007\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +00009008Pad a numeric string S with zeros on the left, to fill a field\n\
9009of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010
9011static PyObject *
9012unicode_zfill(PyUnicodeObject *self, PyObject *args)
9013{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009014 Py_ssize_t fill;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 PyUnicodeObject *u;
9016
Martin v. Löwis18e16552006-02-15 17:27:45 +00009017 Py_ssize_t width;
9018 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 return NULL;
9020
9021 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00009022 if (PyUnicode_CheckExact(self)) {
9023 Py_INCREF(self);
9024 return (PyObject*) self;
9025 }
9026 else
9027 return PyUnicode_FromUnicode(
9028 PyUnicode_AS_UNICODE(self),
9029 PyUnicode_GET_SIZE(self)
Benjamin Peterson29060642009-01-31 22:14:21 +00009030 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031 }
9032
9033 fill = width - self->length;
9034
9035 u = pad(self, fill, 0, '0');
9036
Walter Dörwald068325e2002-04-15 13:36:47 +00009037 if (u == NULL)
9038 return NULL;
9039
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 if (u->str[fill] == '+' || u->str[fill] == '-') {
9041 /* move sign to beginning of string */
9042 u->str[0] = u->str[fill];
9043 u->str[fill] = '0';
9044 }
9045
9046 return (PyObject*) u;
9047}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048
9049#if 0
9050static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009051unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052{
Christian Heimes2202f872008-02-06 14:31:34 +00009053 return PyLong_FromLong(numfree);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009054}
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009055
9056static PyObject *
9057unicode__decimal2ascii(PyObject *self)
9058{
9059 return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
9060 PyUnicode_GET_SIZE(self));
9061}
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062#endif
9063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009064PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009065 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009067Return True if S starts with the specified prefix, False otherwise.\n\
9068With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009069With optional end, stop comparing S at that position.\n\
9070prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071
9072static PyObject *
9073unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009076 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009077 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009078 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009079 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009080 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009082 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009083 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9084 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009085 if (PyTuple_Check(subobj)) {
9086 Py_ssize_t i;
9087 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9088 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009089 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009090 if (substring == NULL)
9091 return NULL;
9092 result = tailmatch(self, substring, start, end, -1);
9093 Py_DECREF(substring);
9094 if (result) {
9095 Py_RETURN_TRUE;
9096 }
9097 }
9098 /* nothing matched */
9099 Py_RETURN_FALSE;
9100 }
9101 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009104 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009105 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009106 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107}
9108
9109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009110PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00009113Return True if S ends with the specified suffix, False otherwise.\n\
9114With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009115With optional end, stop comparing S at that position.\n\
9116suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117
9118static PyObject *
9119unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009122 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009124 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009125 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009126 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009128 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
9130 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009131 if (PyTuple_Check(subobj)) {
9132 Py_ssize_t i;
9133 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9134 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009136 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009138 result = tailmatch(self, substring, start, end, +1);
9139 Py_DECREF(substring);
9140 if (result) {
9141 Py_RETURN_TRUE;
9142 }
9143 }
9144 Py_RETURN_FALSE;
9145 }
9146 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009148 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009150 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00009152 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009153}
9154
Eric Smith8c663262007-08-25 02:26:07 +00009155#include "stringlib/string_format.h"
9156
9157PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009159\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009160Return a formatted version of S, using substitutions from args and kwargs.\n\
9161The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +00009162
Eric Smith27bbca62010-11-04 17:06:58 +00009163PyDoc_STRVAR(format_map__doc__,
9164 "S.format_map(mapping) -> str\n\
9165\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009166Return a formatted version of S, using substitutions from mapping.\n\
9167The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +00009168
Eric Smith4a7d76d2008-05-30 18:10:19 +00009169static PyObject *
9170unicode__format__(PyObject* self, PyObject* args)
9171{
9172 PyObject *format_spec;
9173
9174 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9175 return NULL;
9176
9177 return _PyUnicode_FormatAdvanced(self,
9178 PyUnicode_AS_UNICODE(format_spec),
9179 PyUnicode_GET_SIZE(format_spec));
9180}
9181
Eric Smith8c663262007-08-25 02:26:07 +00009182PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009183 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +00009184\n\
Eric Smith51d2fd92010-11-06 19:27:37 +00009185Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +00009186
9187static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009188unicode__sizeof__(PyUnicodeObject *v)
9189{
Robert Schuppeniesfbe94c52008-07-14 10:13:31 +00009190 return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9191 sizeof(Py_UNICODE) * (v->length + 1));
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009192}
9193
9194PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009196
9197static PyObject *
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009198unicode_getnewargs(PyUnicodeObject *v)
9199{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009200 return Py_BuildValue("(u#)", v->str, v->length);
Guido van Rossum5d9113d2003-01-29 17:58:45 +00009201}
9202
Guido van Rossumd57fd912000-03-10 22:53:23 +00009203static PyMethodDef unicode_methods[] = {
9204
9205 /* Order is according to common usage: often used methods should
9206 appear first, since lookup is done sequentially. */
9207
Benjamin Peterson28a4dce2010-12-12 01:33:04 +00009208 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009209 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9210 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009211 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009212 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9213 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9214 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9215 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9216 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9217 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9218 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009219 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009220 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9221 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9222 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009223 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009224 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9225 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9226 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009227 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +00009228 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009229 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00009230 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009231 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9232 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9233 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9234 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9235 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9236 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9237 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9238 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9239 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9240 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9241 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9242 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9243 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9244 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +00009245 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +00009246 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009247 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +00009248 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +00009249 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +00009250 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +00009251 {"maketrans", (PyCFunction) unicode_maketrans,
9252 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +00009253 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00009254#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009255 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256#endif
9257
9258#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009259 /* These methods are just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00009260 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009261 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262#endif
9263
Benjamin Peterson14339b62009-01-31 16:36:08 +00009264 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265 {NULL, NULL}
9266};
9267
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009268static PyObject *
9269unicode_mod(PyObject *v, PyObject *w)
9270{
Benjamin Peterson29060642009-01-31 22:14:21 +00009271 if (!PyUnicode_Check(v)) {
9272 Py_INCREF(Py_NotImplemented);
9273 return Py_NotImplemented;
9274 }
9275 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009276}
9277
9278static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009279 0, /*nb_add*/
9280 0, /*nb_subtract*/
9281 0, /*nb_multiply*/
9282 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +00009283};
9284
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009286 (lenfunc) unicode_length, /* sq_length */
9287 PyUnicode_Concat, /* sq_concat */
9288 (ssizeargfunc) unicode_repeat, /* sq_repeat */
9289 (ssizeargfunc) unicode_getitem, /* sq_item */
9290 0, /* sq_slice */
9291 0, /* sq_ass_item */
9292 0, /* sq_ass_slice */
9293 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009294};
9295
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009296static PyObject*
9297unicode_subscript(PyUnicodeObject* self, PyObject* item)
9298{
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009299 if (PyIndex_Check(item)) {
9300 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009301 if (i == -1 && PyErr_Occurred())
9302 return NULL;
9303 if (i < 0)
Martin v. Löwisdea59e52006-01-05 10:00:36 +00009304 i += PyUnicode_GET_SIZE(self);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009305 return unicode_getitem(self, i);
9306 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +00009307 Py_ssize_t start, stop, step, slicelength, cur, i;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009308 Py_UNICODE* source_buf;
9309 Py_UNICODE* result_buf;
9310 PyObject* result;
9311
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00009312 if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
Benjamin Peterson29060642009-01-31 22:14:21 +00009313 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009314 return NULL;
9315 }
9316
9317 if (slicelength <= 0) {
9318 return PyUnicode_FromUnicode(NULL, 0);
Thomas Woutersed03b412007-08-28 21:37:11 +00009319 } else if (start == 0 && step == 1 && slicelength == self->length &&
9320 PyUnicode_CheckExact(self)) {
9321 Py_INCREF(self);
9322 return (PyObject *)self;
9323 } else if (step == 1) {
9324 return PyUnicode_FromUnicode(self->str + start, slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009325 } else {
9326 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
Christian Heimesb186d002008-03-18 15:15:01 +00009327 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9328 sizeof(Py_UNICODE));
Benjamin Peterson14339b62009-01-31 16:36:08 +00009329
Benjamin Peterson29060642009-01-31 22:14:21 +00009330 if (result_buf == NULL)
9331 return PyErr_NoMemory();
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009332
9333 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9334 result_buf[i] = source_buf[cur];
9335 }
Tim Petersced69f82003-09-16 20:30:58 +00009336
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009337 result = PyUnicode_FromUnicode(result_buf, slicelength);
Christian Heimesb186d002008-03-18 15:15:01 +00009338 PyObject_FREE(result_buf);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009339 return result;
9340 }
9341 } else {
9342 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9343 return NULL;
9344 }
9345}
9346
9347static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009348 (lenfunc)unicode_length, /* mp_length */
9349 (binaryfunc)unicode_subscript, /* mp_subscript */
9350 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00009351};
9352
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354/* Helpers for PyUnicode_Format() */
9355
9356static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +00009357getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009359 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009361 (*p_argidx)++;
9362 if (arglen < 0)
9363 return args;
9364 else
9365 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366 }
9367 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 return NULL;
9370}
9371
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009372/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009374static PyObject *
9375formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376{
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009377 char *p;
9378 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379 double x;
Tim Petersced69f82003-09-16 20:30:58 +00009380
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 x = PyFloat_AsDouble(v);
9382 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009383 return NULL;
9384
Guido van Rossumd57fd912000-03-10 22:53:23 +00009385 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009386 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +00009387
Eric Smith0923d1d2009-04-16 20:16:10 +00009388 p = PyOS_double_to_string(x, type, prec,
9389 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009390 if (p == NULL)
9391 return NULL;
9392 result = PyUnicode_FromStringAndSize(p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +00009393 PyMem_Free(p);
9394 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395}
9396
Tim Peters38fd5b62000-09-21 05:43:11 +00009397static PyObject*
9398formatlong(PyObject *val, int flags, int prec, int type)
9399{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009400 char *buf;
9401 int len;
9402 PyObject *str; /* temporary string object. */
9403 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009404
Benjamin Peterson14339b62009-01-31 16:36:08 +00009405 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9406 if (!str)
9407 return NULL;
9408 result = PyUnicode_FromStringAndSize(buf, len);
9409 Py_DECREF(str);
9410 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +00009411}
9412
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413static int
9414formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009415 size_t buflen,
9416 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009418 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009419 if (PyUnicode_Check(v)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009420 if (PyUnicode_GET_SIZE(v) == 1) {
9421 buf[0] = PyUnicode_AS_UNICODE(v)[0];
9422 buf[1] = '\0';
9423 return 1;
9424 }
9425#ifndef Py_UNICODE_WIDE
9426 if (PyUnicode_GET_SIZE(v) == 2) {
9427 /* Decode a valid surrogate pair */
9428 int c0 = PyUnicode_AS_UNICODE(v)[0];
9429 int c1 = PyUnicode_AS_UNICODE(v)[1];
9430 if (0xD800 <= c0 && c0 <= 0xDBFF &&
9431 0xDC00 <= c1 && c1 <= 0xDFFF) {
9432 buf[0] = c0;
9433 buf[1] = c1;
9434 buf[2] = '\0';
9435 return 2;
9436 }
9437 }
9438#endif
9439 goto onError;
9440 }
9441 else {
9442 /* Integer input truncated to a character */
9443 long x;
9444 x = PyLong_AsLong(v);
9445 if (x == -1 && PyErr_Occurred())
9446 goto onError;
9447
9448 if (x < 0 || x > 0x10ffff) {
9449 PyErr_SetString(PyExc_OverflowError,
9450 "%c arg not in range(0x110000)");
9451 return -1;
9452 }
9453
9454#ifndef Py_UNICODE_WIDE
9455 if (x > 0xffff) {
9456 x -= 0x10000;
9457 buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9458 buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9459 return 2;
9460 }
9461#endif
9462 buf[0] = (Py_UNICODE) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009463 buf[1] = '\0';
9464 return 1;
9465 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +00009466
Benjamin Peterson29060642009-01-31 22:14:21 +00009467 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009468 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009469 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00009470 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471}
9472
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009473/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009474 FORMATBUFLEN is the length of the buffer in which chars are formatted.
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009475*/
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009476#define FORMATBUFLEN (size_t)10
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00009477
Alexander Belopolsky40018472011-02-26 01:02:56 +00009478PyObject *
9479PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480{
9481 Py_UNICODE *fmt, *res;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009482 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483 int args_owned = 0;
9484 PyUnicodeObject *result = NULL;
9485 PyObject *dict = NULL;
9486 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00009487
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009489 PyErr_BadInternalCall();
9490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491 }
9492 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00009493 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009495 fmt = PyUnicode_AS_UNICODE(uformat);
9496 fmtcnt = PyUnicode_GET_SIZE(uformat);
9497
9498 reslen = rescnt = fmtcnt + 100;
9499 result = _PyUnicode_New(reslen);
9500 if (result == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009501 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 res = PyUnicode_AS_UNICODE(result);
9503
9504 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 arglen = PyTuple_Size(args);
9506 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507 }
9508 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009509 arglen = -1;
9510 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 }
Christian Heimes90aa7642007-12-19 02:45:37 +00009512 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +00009513 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +00009514 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009515
9516 while (--fmtcnt >= 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009517 if (*fmt != '%') {
9518 if (--rescnt < 0) {
9519 rescnt = fmtcnt + 100;
9520 reslen += rescnt;
9521 if (_PyUnicode_Resize(&result, reslen) < 0)
9522 goto onError;
9523 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9524 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009525 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009526 *res++ = *fmt++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009527 }
9528 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009529 /* Got a format specifier */
9530 int flags = 0;
9531 Py_ssize_t width = -1;
9532 int prec = -1;
9533 Py_UNICODE c = '\0';
9534 Py_UNICODE fill;
9535 int isnumok;
9536 PyObject *v = NULL;
9537 PyObject *temp = NULL;
9538 Py_UNICODE *pbuf;
9539 Py_UNICODE sign;
9540 Py_ssize_t len;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009541 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542
Benjamin Peterson29060642009-01-31 22:14:21 +00009543 fmt++;
9544 if (*fmt == '(') {
9545 Py_UNICODE *keystart;
9546 Py_ssize_t keylen;
9547 PyObject *key;
9548 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +00009549
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 if (dict == NULL) {
9551 PyErr_SetString(PyExc_TypeError,
9552 "format requires a mapping");
9553 goto onError;
9554 }
9555 ++fmt;
9556 --fmtcnt;
9557 keystart = fmt;
9558 /* Skip over balanced parentheses */
9559 while (pcount > 0 && --fmtcnt >= 0) {
9560 if (*fmt == ')')
9561 --pcount;
9562 else if (*fmt == '(')
9563 ++pcount;
9564 fmt++;
9565 }
9566 keylen = fmt - keystart - 1;
9567 if (fmtcnt < 0 || pcount > 0) {
9568 PyErr_SetString(PyExc_ValueError,
9569 "incomplete format key");
9570 goto onError;
9571 }
9572#if 0
9573 /* keys are converted to strings using UTF-8 and
9574 then looked up since Python uses strings to hold
9575 variables names etc. in its namespaces and we
9576 wouldn't want to break common idioms. */
9577 key = PyUnicode_EncodeUTF8(keystart,
9578 keylen,
9579 NULL);
9580#else
9581 key = PyUnicode_FromUnicode(keystart, keylen);
9582#endif
9583 if (key == NULL)
9584 goto onError;
9585 if (args_owned) {
9586 Py_DECREF(args);
9587 args_owned = 0;
9588 }
9589 args = PyObject_GetItem(dict, key);
9590 Py_DECREF(key);
9591 if (args == NULL) {
9592 goto onError;
9593 }
9594 args_owned = 1;
9595 arglen = -1;
9596 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009597 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009598 while (--fmtcnt >= 0) {
9599 switch (c = *fmt++) {
9600 case '-': flags |= F_LJUST; continue;
9601 case '+': flags |= F_SIGN; continue;
9602 case ' ': flags |= F_BLANK; continue;
9603 case '#': flags |= F_ALT; continue;
9604 case '0': flags |= F_ZERO; continue;
9605 }
9606 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009607 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009608 if (c == '*') {
9609 v = getnextarg(args, arglen, &argidx);
9610 if (v == NULL)
9611 goto onError;
9612 if (!PyLong_Check(v)) {
9613 PyErr_SetString(PyExc_TypeError,
9614 "* wants int");
9615 goto onError;
9616 }
9617 width = PyLong_AsLong(v);
9618 if (width == -1 && PyErr_Occurred())
9619 goto onError;
9620 if (width < 0) {
9621 flags |= F_LJUST;
9622 width = -width;
9623 }
9624 if (--fmtcnt >= 0)
9625 c = *fmt++;
9626 }
9627 else if (c >= '0' && c <= '9') {
9628 width = c - '0';
9629 while (--fmtcnt >= 0) {
9630 c = *fmt++;
9631 if (c < '0' || c > '9')
9632 break;
9633 if ((width*10) / 10 != width) {
9634 PyErr_SetString(PyExc_ValueError,
9635 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009636 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00009637 }
9638 width = width*10 + (c - '0');
9639 }
9640 }
9641 if (c == '.') {
9642 prec = 0;
9643 if (--fmtcnt >= 0)
9644 c = *fmt++;
9645 if (c == '*') {
9646 v = getnextarg(args, arglen, &argidx);
9647 if (v == NULL)
9648 goto onError;
9649 if (!PyLong_Check(v)) {
9650 PyErr_SetString(PyExc_TypeError,
9651 "* wants int");
9652 goto onError;
9653 }
9654 prec = PyLong_AsLong(v);
9655 if (prec == -1 && PyErr_Occurred())
9656 goto onError;
9657 if (prec < 0)
9658 prec = 0;
9659 if (--fmtcnt >= 0)
9660 c = *fmt++;
9661 }
9662 else if (c >= '0' && c <= '9') {
9663 prec = c - '0';
9664 while (--fmtcnt >= 0) {
Stefan Krah99212f62010-07-19 17:58:26 +00009665 c = *fmt++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009666 if (c < '0' || c > '9')
9667 break;
9668 if ((prec*10) / 10 != prec) {
9669 PyErr_SetString(PyExc_ValueError,
9670 "prec too big");
9671 goto onError;
9672 }
9673 prec = prec*10 + (c - '0');
9674 }
9675 }
9676 } /* prec */
9677 if (fmtcnt >= 0) {
9678 if (c == 'h' || c == 'l' || c == 'L') {
9679 if (--fmtcnt >= 0)
9680 c = *fmt++;
9681 }
9682 }
9683 if (fmtcnt < 0) {
9684 PyErr_SetString(PyExc_ValueError,
9685 "incomplete format");
9686 goto onError;
9687 }
9688 if (c != '%') {
9689 v = getnextarg(args, arglen, &argidx);
9690 if (v == NULL)
9691 goto onError;
9692 }
9693 sign = 0;
9694 fill = ' ';
9695 switch (c) {
9696
9697 case '%':
9698 pbuf = formatbuf;
9699 /* presume that buffer length is at least 1 */
9700 pbuf[0] = '%';
9701 len = 1;
9702 break;
9703
9704 case 's':
9705 case 'r':
9706 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +00009707 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +00009708 temp = v;
9709 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009710 }
9711 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009712 if (c == 's')
9713 temp = PyObject_Str(v);
9714 else if (c == 'r')
9715 temp = PyObject_Repr(v);
9716 else
9717 temp = PyObject_ASCII(v);
9718 if (temp == NULL)
9719 goto onError;
9720 if (PyUnicode_Check(temp))
9721 /* nothing to do */;
9722 else {
9723 Py_DECREF(temp);
9724 PyErr_SetString(PyExc_TypeError,
9725 "%s argument has non-string str()");
9726 goto onError;
9727 }
9728 }
9729 pbuf = PyUnicode_AS_UNICODE(temp);
9730 len = PyUnicode_GET_SIZE(temp);
9731 if (prec >= 0 && len > prec)
9732 len = prec;
9733 break;
9734
9735 case 'i':
9736 case 'd':
9737 case 'u':
9738 case 'o':
9739 case 'x':
9740 case 'X':
9741 if (c == 'i')
9742 c = 'd';
9743 isnumok = 0;
9744 if (PyNumber_Check(v)) {
9745 PyObject *iobj=NULL;
9746
9747 if (PyLong_Check(v)) {
9748 iobj = v;
9749 Py_INCREF(iobj);
9750 }
9751 else {
9752 iobj = PyNumber_Long(v);
9753 }
9754 if (iobj!=NULL) {
9755 if (PyLong_Check(iobj)) {
9756 isnumok = 1;
9757 temp = formatlong(iobj, flags, prec, c);
9758 Py_DECREF(iobj);
9759 if (!temp)
9760 goto onError;
9761 pbuf = PyUnicode_AS_UNICODE(temp);
9762 len = PyUnicode_GET_SIZE(temp);
9763 sign = 1;
9764 }
9765 else {
9766 Py_DECREF(iobj);
9767 }
9768 }
9769 }
9770 if (!isnumok) {
9771 PyErr_Format(PyExc_TypeError,
9772 "%%%c format: a number is required, "
9773 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9774 goto onError;
9775 }
9776 if (flags & F_ZERO)
9777 fill = '0';
9778 break;
9779
9780 case 'e':
9781 case 'E':
9782 case 'f':
9783 case 'F':
9784 case 'g':
9785 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009786 temp = formatfloat(v, flags, prec, c);
9787 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +00009788 goto onError;
Mark Dickinsonf489caf2009-05-01 11:42:00 +00009789 pbuf = PyUnicode_AS_UNICODE(temp);
9790 len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009791 sign = 1;
9792 if (flags & F_ZERO)
9793 fill = '0';
9794 break;
9795
9796 case 'c':
9797 pbuf = formatbuf;
9798 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9799 if (len < 0)
9800 goto onError;
9801 break;
9802
9803 default:
9804 PyErr_Format(PyExc_ValueError,
9805 "unsupported format character '%c' (0x%x) "
9806 "at index %zd",
9807 (31<=c && c<=126) ? (char)c : '?',
9808 (int)c,
9809 (Py_ssize_t)(fmt - 1 -
9810 PyUnicode_AS_UNICODE(uformat)));
9811 goto onError;
9812 }
9813 if (sign) {
9814 if (*pbuf == '-' || *pbuf == '+') {
9815 sign = *pbuf++;
9816 len--;
9817 }
9818 else if (flags & F_SIGN)
9819 sign = '+';
9820 else if (flags & F_BLANK)
9821 sign = ' ';
9822 else
9823 sign = 0;
9824 }
9825 if (width < len)
9826 width = len;
9827 if (rescnt - (sign != 0) < width) {
9828 reslen -= rescnt;
9829 rescnt = width + fmtcnt + 100;
9830 reslen += rescnt;
9831 if (reslen < 0) {
9832 Py_XDECREF(temp);
9833 PyErr_NoMemory();
9834 goto onError;
9835 }
9836 if (_PyUnicode_Resize(&result, reslen) < 0) {
9837 Py_XDECREF(temp);
9838 goto onError;
9839 }
9840 res = PyUnicode_AS_UNICODE(result)
9841 + reslen - rescnt;
9842 }
9843 if (sign) {
9844 if (fill != ' ')
9845 *res++ = sign;
9846 rescnt--;
9847 if (width > len)
9848 width--;
9849 }
9850 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9851 assert(pbuf[0] == '0');
9852 assert(pbuf[1] == c);
9853 if (fill != ' ') {
9854 *res++ = *pbuf++;
9855 *res++ = *pbuf++;
9856 }
9857 rescnt -= 2;
9858 width -= 2;
9859 if (width < 0)
9860 width = 0;
9861 len -= 2;
9862 }
9863 if (width > len && !(flags & F_LJUST)) {
9864 do {
9865 --rescnt;
9866 *res++ = fill;
9867 } while (--width > len);
9868 }
9869 if (fill == ' ') {
9870 if (sign)
9871 *res++ = sign;
9872 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9873 assert(pbuf[0] == '0');
9874 assert(pbuf[1] == c);
9875 *res++ = *pbuf++;
9876 *res++ = *pbuf++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009877 }
9878 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009879 Py_UNICODE_COPY(res, pbuf, len);
9880 res += len;
9881 rescnt -= len;
9882 while (--width >= len) {
9883 --rescnt;
9884 *res++ = ' ';
9885 }
9886 if (dict && (argidx < arglen) && c != '%') {
9887 PyErr_SetString(PyExc_TypeError,
9888 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +00009889 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +00009890 goto onError;
9891 }
9892 Py_XDECREF(temp);
9893 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894 } /* until end */
9895 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009896 PyErr_SetString(PyExc_TypeError,
9897 "not all arguments converted during string formatting");
9898 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899 }
9900
Thomas Woutersa96affe2006-03-12 00:29:36 +00009901 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009902 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009904 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905 }
9906 Py_DECREF(uformat);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907 return (PyObject *)result;
9908
Benjamin Peterson29060642009-01-31 22:14:21 +00009909 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910 Py_XDECREF(result);
9911 Py_DECREF(uformat);
9912 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009913 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914 }
9915 return NULL;
9916}
9917
Jeremy Hylton938ace62002-07-17 16:30:39 +00009918static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00009919unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9920
Tim Peters6d6c1a32001-08-02 04:15:00 +00009921static PyObject *
9922unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9923{
Benjamin Peterson29060642009-01-31 22:14:21 +00009924 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009925 static char *kwlist[] = {"object", "encoding", "errors", 0};
9926 char *encoding = NULL;
9927 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +00009928
Benjamin Peterson14339b62009-01-31 16:36:08 +00009929 if (type != &PyUnicode_Type)
9930 return unicode_subtype_new(type, args, kwds);
9931 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +00009932 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009933 return NULL;
9934 if (x == NULL)
9935 return (PyObject *)_PyUnicode_New(0);
9936 if (encoding == NULL && errors == NULL)
9937 return PyObject_Str(x);
9938 else
Benjamin Peterson29060642009-01-31 22:14:21 +00009939 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +00009940}
9941
Guido van Rossume023fe02001-08-30 03:12:59 +00009942static PyObject *
9943unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9944{
Benjamin Peterson14339b62009-01-31 16:36:08 +00009945 PyUnicodeObject *tmp, *pnew;
9946 Py_ssize_t n;
Guido van Rossume023fe02001-08-30 03:12:59 +00009947
Benjamin Peterson14339b62009-01-31 16:36:08 +00009948 assert(PyType_IsSubtype(type, &PyUnicode_Type));
9949 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9950 if (tmp == NULL)
9951 return NULL;
9952 assert(PyUnicode_Check(tmp));
9953 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9954 if (pnew == NULL) {
9955 Py_DECREF(tmp);
9956 return NULL;
9957 }
9958 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9959 if (pnew->str == NULL) {
9960 _Py_ForgetReference((PyObject *)pnew);
9961 PyObject_Del(pnew);
9962 Py_DECREF(tmp);
9963 return PyErr_NoMemory();
9964 }
9965 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9966 pnew->length = n;
9967 pnew->hash = tmp->hash;
9968 Py_DECREF(tmp);
9969 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00009970}
9971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009972PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +00009973 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +00009974\n\
Collin Winterd474ce82007-08-07 19:42:11 +00009975Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00009976encoding defaults to the current default string encoding.\n\
9977errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00009978
Guido van Rossum50e9fb92006-08-17 05:42:55 +00009979static PyObject *unicode_iter(PyObject *seq);
9980
Guido van Rossumd57fd912000-03-10 22:53:23 +00009981PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +00009982 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +00009983 "str", /* tp_name */
9984 sizeof(PyUnicodeObject), /* tp_size */
9985 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009987 (destructor)unicode_dealloc, /* tp_dealloc */
9988 0, /* tp_print */
9989 0, /* tp_getattr */
9990 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +00009991 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +00009992 unicode_repr, /* tp_repr */
9993 &unicode_as_number, /* tp_as_number */
9994 &unicode_as_sequence, /* tp_as_sequence */
9995 &unicode_as_mapping, /* tp_as_mapping */
9996 (hashfunc) unicode_hash, /* tp_hash*/
9997 0, /* tp_call*/
9998 (reprfunc) unicode_str, /* tp_str */
9999 PyObject_GenericGetAttr, /* tp_getattro */
10000 0, /* tp_setattro */
10001 0, /* tp_as_buffer */
10002 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010004 unicode_doc, /* tp_doc */
10005 0, /* tp_traverse */
10006 0, /* tp_clear */
10007 PyUnicode_RichCompare, /* tp_richcompare */
10008 0, /* tp_weaklistoffset */
10009 unicode_iter, /* tp_iter */
10010 0, /* tp_iternext */
10011 unicode_methods, /* tp_methods */
10012 0, /* tp_members */
10013 0, /* tp_getset */
10014 &PyBaseObject_Type, /* tp_base */
10015 0, /* tp_dict */
10016 0, /* tp_descr_get */
10017 0, /* tp_descr_set */
10018 0, /* tp_dictoffset */
10019 0, /* tp_init */
10020 0, /* tp_alloc */
10021 unicode_new, /* tp_new */
10022 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023};
10024
10025/* Initialize the Unicode implementation */
10026
Thomas Wouters78890102000-07-22 19:25:51 +000010027void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010029 int i;
10030
Thomas Wouters477c8d52006-05-27 19:21:47 +000010031 /* XXX - move this array to unicodectype.c ? */
10032 Py_UNICODE linebreak[] = {
10033 0x000A, /* LINE FEED */
10034 0x000D, /* CARRIAGE RETURN */
10035 0x001C, /* FILE SEPARATOR */
10036 0x001D, /* GROUP SEPARATOR */
10037 0x001E, /* RECORD SEPARATOR */
10038 0x0085, /* NEXT LINE */
10039 0x2028, /* LINE SEPARATOR */
10040 0x2029, /* PARAGRAPH SEPARATOR */
10041 };
10042
Fred Drakee4315f52000-05-09 19:53:39 +000010043 /* Init the implementation */
Christian Heimes2202f872008-02-06 14:31:34 +000010044 free_list = NULL;
10045 numfree = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046 unicode_empty = _PyUnicode_New(0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010047 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +000010048 return;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010049
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010050 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000010051 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000010052 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010053 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000010054
10055 /* initialize the linebreak bloom filter */
10056 bloom_linebreak = make_bloom_mask(
10057 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10058 );
Thomas Wouters0e3f5912006-08-11 14:57:12 +000010059
10060 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010061}
10062
10063/* Finalize the Unicode implementation */
10064
Christian Heimesa156e092008-02-16 07:38:31 +000010065int
10066PyUnicode_ClearFreeList(void)
10067{
10068 int freelist_size = numfree;
10069 PyUnicodeObject *u;
10070
10071 for (u = free_list; u != NULL;) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010072 PyUnicodeObject *v = u;
10073 u = *(PyUnicodeObject **)u;
10074 if (v->str)
10075 PyObject_DEL(v->str);
10076 Py_XDECREF(v->defenc);
10077 PyObject_Del(v);
10078 numfree--;
Christian Heimesa156e092008-02-16 07:38:31 +000010079 }
10080 free_list = NULL;
10081 assert(numfree == 0);
10082 return freelist_size;
10083}
10084
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085void
Thomas Wouters78890102000-07-22 19:25:51 +000010086_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010088 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010089
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000010090 Py_XDECREF(unicode_empty);
10091 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000010092
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010093 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010094 if (unicode_latin1[i]) {
10095 Py_DECREF(unicode_latin1[i]);
10096 unicode_latin1[i] = NULL;
10097 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000010098 }
Christian Heimesa156e092008-02-16 07:38:31 +000010099 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000010101
Walter Dörwald16807132007-05-25 13:52:07 +000010102void
10103PyUnicode_InternInPlace(PyObject **p)
10104{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010105 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10106 PyObject *t;
10107 if (s == NULL || !PyUnicode_Check(s))
10108 Py_FatalError(
10109 "PyUnicode_InternInPlace: unicode strings only please!");
10110 /* If it's a subclass, we don't really know what putting
10111 it in the interned dict might do. */
10112 if (!PyUnicode_CheckExact(s))
10113 return;
10114 if (PyUnicode_CHECK_INTERNED(s))
10115 return;
10116 if (interned == NULL) {
10117 interned = PyDict_New();
10118 if (interned == NULL) {
10119 PyErr_Clear(); /* Don't leave an exception */
10120 return;
10121 }
10122 }
10123 /* It might be that the GetItem call fails even
10124 though the key is present in the dictionary,
10125 namely when this happens during a stack overflow. */
10126 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000010127 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010128 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000010129
Benjamin Peterson29060642009-01-31 22:14:21 +000010130 if (t) {
10131 Py_INCREF(t);
10132 Py_DECREF(*p);
10133 *p = t;
10134 return;
10135 }
Walter Dörwald16807132007-05-25 13:52:07 +000010136
Benjamin Peterson14339b62009-01-31 16:36:08 +000010137 PyThreadState_GET()->recursion_critical = 1;
10138 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10139 PyErr_Clear();
10140 PyThreadState_GET()->recursion_critical = 0;
10141 return;
10142 }
10143 PyThreadState_GET()->recursion_critical = 0;
10144 /* The two references in interned are not counted by refcnt.
10145 The deallocator will take care of this */
10146 Py_REFCNT(s) -= 2;
10147 PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000010148}
10149
10150void
10151PyUnicode_InternImmortal(PyObject **p)
10152{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010153 PyUnicode_InternInPlace(p);
10154 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10155 PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10156 Py_INCREF(*p);
10157 }
Walter Dörwald16807132007-05-25 13:52:07 +000010158}
10159
10160PyObject *
10161PyUnicode_InternFromString(const char *cp)
10162{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010163 PyObject *s = PyUnicode_FromString(cp);
10164 if (s == NULL)
10165 return NULL;
10166 PyUnicode_InternInPlace(&s);
10167 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000010168}
10169
Alexander Belopolsky40018472011-02-26 01:02:56 +000010170void
10171_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000010172{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010173 PyObject *keys;
10174 PyUnicodeObject *s;
10175 Py_ssize_t i, n;
10176 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000010177
Benjamin Peterson14339b62009-01-31 16:36:08 +000010178 if (interned == NULL || !PyDict_Check(interned))
10179 return;
10180 keys = PyDict_Keys(interned);
10181 if (keys == NULL || !PyList_Check(keys)) {
10182 PyErr_Clear();
10183 return;
10184 }
Walter Dörwald16807132007-05-25 13:52:07 +000010185
Benjamin Peterson14339b62009-01-31 16:36:08 +000010186 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10187 detector, interned unicode strings are not forcibly deallocated;
10188 rather, we give them their stolen references back, and then clear
10189 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000010190
Benjamin Peterson14339b62009-01-31 16:36:08 +000010191 n = PyList_GET_SIZE(keys);
10192 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000010193 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010194 for (i = 0; i < n; i++) {
10195 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10196 switch (s->state) {
10197 case SSTATE_NOT_INTERNED:
10198 /* XXX Shouldn't happen */
10199 break;
10200 case SSTATE_INTERNED_IMMORTAL:
10201 Py_REFCNT(s) += 1;
10202 immortal_size += s->length;
10203 break;
10204 case SSTATE_INTERNED_MORTAL:
10205 Py_REFCNT(s) += 2;
10206 mortal_size += s->length;
10207 break;
10208 default:
10209 Py_FatalError("Inconsistent interned string state.");
10210 }
10211 s->state = SSTATE_NOT_INTERNED;
10212 }
10213 fprintf(stderr, "total size of all interned strings: "
10214 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10215 "mortal/immortal\n", mortal_size, immortal_size);
10216 Py_DECREF(keys);
10217 PyDict_Clear(interned);
10218 Py_DECREF(interned);
10219 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000010220}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010221
10222
10223/********************* Unicode Iterator **************************/
10224
10225typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010226 PyObject_HEAD
10227 Py_ssize_t it_index;
10228 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010229} unicodeiterobject;
10230
10231static void
10232unicodeiter_dealloc(unicodeiterobject *it)
10233{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010234 _PyObject_GC_UNTRACK(it);
10235 Py_XDECREF(it->it_seq);
10236 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010237}
10238
10239static int
10240unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10241{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010242 Py_VISIT(it->it_seq);
10243 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010244}
10245
10246static PyObject *
10247unicodeiter_next(unicodeiterobject *it)
10248{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010249 PyUnicodeObject *seq;
10250 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010251
Benjamin Peterson14339b62009-01-31 16:36:08 +000010252 assert(it != NULL);
10253 seq = it->it_seq;
10254 if (seq == NULL)
10255 return NULL;
10256 assert(PyUnicode_Check(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010257
Benjamin Peterson14339b62009-01-31 16:36:08 +000010258 if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10259 item = PyUnicode_FromUnicode(
Benjamin Peterson29060642009-01-31 22:14:21 +000010260 PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010261 if (item != NULL)
10262 ++it->it_index;
10263 return item;
10264 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010265
Benjamin Peterson14339b62009-01-31 16:36:08 +000010266 Py_DECREF(seq);
10267 it->it_seq = NULL;
10268 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010269}
10270
10271static PyObject *
10272unicodeiter_len(unicodeiterobject *it)
10273{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010274 Py_ssize_t len = 0;
10275 if (it->it_seq)
10276 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10277 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010278}
10279
10280PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10281
10282static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010283 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000010284 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000010285 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010286};
10287
10288PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010289 PyVarObject_HEAD_INIT(&PyType_Type, 0)
10290 "str_iterator", /* tp_name */
10291 sizeof(unicodeiterobject), /* tp_basicsize */
10292 0, /* tp_itemsize */
10293 /* methods */
10294 (destructor)unicodeiter_dealloc, /* tp_dealloc */
10295 0, /* tp_print */
10296 0, /* tp_getattr */
10297 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000010298 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000010299 0, /* tp_repr */
10300 0, /* tp_as_number */
10301 0, /* tp_as_sequence */
10302 0, /* tp_as_mapping */
10303 0, /* tp_hash */
10304 0, /* tp_call */
10305 0, /* tp_str */
10306 PyObject_GenericGetAttr, /* tp_getattro */
10307 0, /* tp_setattro */
10308 0, /* tp_as_buffer */
10309 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10310 0, /* tp_doc */
10311 (traverseproc)unicodeiter_traverse, /* tp_traverse */
10312 0, /* tp_clear */
10313 0, /* tp_richcompare */
10314 0, /* tp_weaklistoffset */
10315 PyObject_SelfIter, /* tp_iter */
10316 (iternextfunc)unicodeiter_next, /* tp_iternext */
10317 unicodeiter_methods, /* tp_methods */
10318 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010319};
10320
10321static PyObject *
10322unicode_iter(PyObject *seq)
10323{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010324 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010325
Benjamin Peterson14339b62009-01-31 16:36:08 +000010326 if (!PyUnicode_Check(seq)) {
10327 PyErr_BadInternalCall();
10328 return NULL;
10329 }
10330 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10331 if (it == NULL)
10332 return NULL;
10333 it->it_index = 0;
10334 Py_INCREF(seq);
10335 it->it_seq = (PyUnicodeObject *)seq;
10336 _PyObject_GC_TRACK(it);
10337 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000010338}
10339
Martin v. Löwis5b222132007-06-10 09:51:05 +000010340size_t
10341Py_UNICODE_strlen(const Py_UNICODE *u)
10342{
10343 int res = 0;
10344 while(*u++)
10345 res++;
10346 return res;
10347}
10348
10349Py_UNICODE*
10350Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10351{
10352 Py_UNICODE *u = s1;
10353 while ((*u++ = *s2++));
10354 return s1;
10355}
10356
10357Py_UNICODE*
10358Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10359{
10360 Py_UNICODE *u = s1;
10361 while ((*u++ = *s2++))
10362 if (n-- == 0)
10363 break;
10364 return s1;
10365}
10366
Victor Stinnerc4eb7652010-09-01 23:43:50 +000010367Py_UNICODE*
10368Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10369{
10370 Py_UNICODE *u1 = s1;
10371 u1 += Py_UNICODE_strlen(u1);
10372 Py_UNICODE_strcpy(u1, s2);
10373 return s1;
10374}
10375
Martin v. Löwis5b222132007-06-10 09:51:05 +000010376int
10377Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10378{
10379 while (*s1 && *s2 && *s1 == *s2)
10380 s1++, s2++;
10381 if (*s1 && *s2)
10382 return (*s1 < *s2) ? -1 : +1;
10383 if (*s1)
10384 return 1;
10385 if (*s2)
10386 return -1;
10387 return 0;
10388}
10389
Victor Stinneref8d95c2010-08-16 22:03:11 +000010390int
10391Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10392{
10393 register Py_UNICODE u1, u2;
10394 for (; n != 0; n--) {
10395 u1 = *s1;
10396 u2 = *s2;
10397 if (u1 != u2)
10398 return (u1 < u2) ? -1 : +1;
10399 if (u1 == '\0')
10400 return 0;
10401 s1++;
10402 s2++;
10403 }
10404 return 0;
10405}
10406
Martin v. Löwis5b222132007-06-10 09:51:05 +000010407Py_UNICODE*
10408Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10409{
10410 const Py_UNICODE *p;
10411 for (p = s; *p; p++)
10412 if (*p == c)
10413 return (Py_UNICODE*)p;
10414 return NULL;
10415}
10416
Victor Stinner331ea922010-08-10 16:37:20 +000010417Py_UNICODE*
10418Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10419{
10420 const Py_UNICODE *p;
10421 p = s + Py_UNICODE_strlen(s);
10422 while (p != s) {
10423 p--;
10424 if (*p == c)
10425 return (Py_UNICODE*)p;
10426 }
10427 return NULL;
10428}
10429
Victor Stinner71133ff2010-09-01 23:43:53 +000010430Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000010431PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000010432{
10433 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10434 Py_UNICODE *copy;
10435 Py_ssize_t size;
10436
10437 /* Ensure we won't overflow the size. */
10438 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10439 PyErr_NoMemory();
10440 return NULL;
10441 }
10442 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10443 size *= sizeof(Py_UNICODE);
10444 copy = PyMem_Malloc(size);
10445 if (copy == NULL) {
10446 PyErr_NoMemory();
10447 return NULL;
10448 }
10449 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10450 return copy;
10451}
Martin v. Löwis5b222132007-06-10 09:51:05 +000010452
Georg Brandl66c221e2010-10-14 07:04:07 +000010453/* A _string module, to export formatter_parser and formatter_field_name_split
10454 to the string.Formatter class implemented in Python. */
10455
10456static PyMethodDef _string_methods[] = {
10457 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10458 METH_O, PyDoc_STR("split the argument as a field name")},
10459 {"formatter_parser", (PyCFunction) formatter_parser,
10460 METH_O, PyDoc_STR("parse the argument as a format string")},
10461 {NULL, NULL}
10462};
10463
10464static struct PyModuleDef _string_module = {
10465 PyModuleDef_HEAD_INIT,
10466 "_string",
10467 PyDoc_STR("string helper module"),
10468 0,
10469 _string_methods,
10470 NULL,
10471 NULL,
10472 NULL,
10473 NULL
10474};
10475
10476PyMODINIT_FUNC
10477PyInit__string(void)
10478{
10479 return PyModule_Create(&_string_module);
10480}
10481
10482
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010483#ifdef __cplusplus
10484}
10485#endif